Ensure Volumes are deleted when crawl is canceled (#828)

* operator:
- ensures crawler PVCs are always deleted before the crawl object is finalized (fixes #827)
- refactor to ensure the finalizer handler always runs when finalizing
- remove obsolete config entries
Ilya Kreymer 2023-05-05 12:05:54 -07:00 committed by GitHub
parent 48d34bc3c4
commit aae0e6590e
4 changed files with 66 additions and 44 deletions
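
The fix hinges on the finalize contract: while any PVC for the crawl still exists, the operator deletes it and answers finalized: false, so the finalize hook is invoked again; only once no volume remains does it answer finalized: true and allow the CrawlJob to be removed. Below is a minimal, self-contained sketch of that loop under the assumption of Metacontroller-style finalize behavior; the in-memory cluster dict and the PVC names are stand-ins for the real Kubernetes API, not the operator's actual code.

import asyncio

PVC = "PersistentVolumeClaim.v1"

async def delete_crawl_pvcs(cluster, crawl_id):
    # stand-in for the real delete: drop every PVC labeled for this crawl
    cluster[PVC] = {
        name: obj for name, obj in cluster[PVC].items()
        if obj["labels"].get("crawl") != crawl_id
    }

async def finalize_crawl(cluster, crawl_id):
    # report finalized only once no PVC for the crawl is left
    pvcs = [name for name, obj in cluster[PVC].items()
            if obj["labels"].get("crawl") == crawl_id]
    if pvcs:
        print("Deleting PVCs", pvcs)
        await delete_crawl_pvcs(cluster, crawl_id)
        return {"finalized": False}   # hook will be invoked again
    return {"finalized": True}        # safe to remove the CrawlJob now

async def main():
    cluster = {PVC: {"crawl-data-crawl-abc-0": {"labels": {"crawl": "abc"}}}}
    print(await finalize_crawl(cluster, "abc"))  # deletes PVCs, not finalized yet
    print(await finalize_crawl(cluster, "abc"))  # no PVCs left, finalized

asyncio.run(main())

Returning finalized: false rather than deleting synchronously keeps the hook idempotent: each invocation simply reconciles toward "no PVCs left".
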

View File

@@ -24,10 +24,6 @@ class CrawlManager(K8sAPI):
self.job_image = os.environ["JOB_IMAGE"]
self.job_image_pull_policy = os.environ.get("JOB_PULL_POLICY", "Always")
self.no_delete_jobs = os.environ.get("NO_DELETE_JOBS", "0") != "0"
self.crawler_node_type = os.environ.get("CRAWLER_NODE_TYPE", "")
self.cron_namespace = os.environ.get("CRON_NAMESPACE", "default")
self._default_storages = {}

View File

@@ -30,6 +30,7 @@ from .crawls import (
STS = "StatefulSet.apps/v1"
CMAP = "ConfigMap.v1"
PVC = "PersistentVolumeClaim.v1"
DEFAULT_TTL = 30
@@ -140,18 +141,29 @@ class BtrixOperator(K8sAPI):
scale = spec.get("scale", 1)
status.scale = scale
redis_url = self.get_redis_url(crawl_id)
# if finalizing, crawl is being deleted
if data.finalizing:
# if not yet finished, assume it was canceled, mark as such
if not status.finished:
await self.cancel_crawl(redis_url, crawl_id, status, "canceled")
return await self.finalize_crawl(crawl_id, status, data.related)
if status.finished:
return await self.handle_finished_delete_if_needed(crawl_id, status, spec)
cid = spec["cid"]
redis_url = self.get_redis_url(crawl_id)
try:
configmap = data.related[CMAP][f"crawl-config-{cid}"]["data"]
# pylint: disable=bare-except, broad-except
except:
return await self.cancel_crawl(redis_url, crawl_id, status, "failed")
# fail crawl if config somehow missing, shouldn't generally happen
await self.cancel_crawl(redis_url, crawl_id, status, "failed")
return self._done_response(status)
crawl = CrawlSpec(
id=crawl_id,
@@ -165,10 +177,6 @@ class BtrixOperator(K8sAPI):
expire_time=from_k8s_date(spec.get("expireTime")),
)
# if finalizing and not finished, job is being deleted, so assume crawl has been canceled
if data.finalizing:
return await self.cancel_crawl(redis_url, crawl_id, status, "canceled")
crawl_sts = f"crawl-{crawl_id}"
redis_sts = f"redis-{crawl_id}"
@@ -179,7 +187,7 @@ class BtrixOperator(K8sAPI):
status.state = "starting"
if status.finished:
return await self.handle_finished_delete_if_needed(crawl.id, status, spec)
return await self.handle_finished_delete_if_needed(crawl_id, status, spec)
params = {}
params.update(self.shared_params)
@@ -223,14 +231,20 @@ class BtrixOperator(K8sAPI):
def get_related(self, data: MCBaseRequest):
"""return configmap related to crawl"""
spec = data.parent.get("spec", {})
cid = spec.get("cid")
cid = spec["cid"]
crawl_id = spec["id"]
return {
"relatedResources": [
{
"apiVersion": "v1",
"resource": "configmaps",
"labelSelector": {"matchLabels": {"btrix.crawlconfig": cid}},
}
},
{
"apiVersion": "v1",
"resource": "persistentvolumeclaims",
"labelSelector": {"matchLabels": {"crawl": crawl_id}},
},
]
}
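
By adding persistentvolumeclaims to relatedResources, the operator now receives, on every sync and finalize call, both the crawl's configmap and any PVCs labeled crawl=<crawl_id>, grouped by resource type and keyed by object name. That is the shape the configmap lookup earlier in the handler and finalize_crawl rely on; a rough sketch follows (the object names and configmap contents are illustrative only):

CMAP = "ConfigMap.v1"
PVC = "PersistentVolumeClaim.v1"

# illustrative related payload for one crawl (names are examples only)
related = {
    CMAP: {
        "crawl-config-1234": {"data": {"example-key": "example-value"}},
    },
    PVC: {
        "crawl-data-crawl-abcd-0": {"metadata": {"labels": {"crawl": "abcd"}}},
    },
}

configmap = related[CMAP]["crawl-config-1234"]["data"]  # used by the sync handler
pvc_names = list(related[PVC].keys())                   # checked by finalize_crawl
print(configmap, pvc_names)
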
@@ -241,17 +255,15 @@ class BtrixOperator(K8sAPI):
ttl = spec.get("ttlSecondsAfterFinished", DEFAULT_TTL)
finished = from_k8s_date(status.finished)
if (dt_now() - finished).total_seconds() > ttl:
if (dt_now() - finished).total_seconds() > ttl > 0:
print("Job expired, deleting: " + crawl_id)
asyncio.create_task(self.delete_crawl_job(crawl_id))
return self._done_response(status)
async def delete_crawl_job(self, crawl_id):
# delete the crawljob itself
await super().delete_crawl_job(crawl_id)
async def delete_pvc(self, crawl_id):
"""delete all pvcs for crawl"""
# until delete policy is supported in StatefulSet,
# for now, delete pvcs explicitly
# (don't want to make them children as already owned by sts)
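
In the expiry check above, "> ttl" becomes "> ttl > 0", a chained comparison equivalent to "(elapsed > ttl) and (ttl > 0)"; the practical effect is that a ttlSecondsAfterFinished of 0 now keeps a finished job around rather than expiring it immediately. A quick illustration:

# chained comparison: elapsed > ttl > 0  ==  (elapsed > ttl) and (ttl > 0)
def expired(elapsed_seconds, ttl):
    return elapsed_seconds > ttl > 0

print(expired(120, 30))  # True  -> finished job past its ttl, delete it
print(expired(120, 0))   # False -> ttl of 0 means never auto-delete
print(expired(10, 30))   # False -> ttl not reached yet
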
@@ -263,20 +275,34 @@ class BtrixOperator(K8sAPI):
except Exception as exc:
print("PVC Delete failed", exc, flush=True)
# pylint: disable=too-many-arguments
async def cancel_crawl(self, redis_url, crawl_id, status, state):
"""immediately cancel crawl with specified state"""
redis = await self._get_redis(redis_url)
await self.mark_finished(redis, crawl_id, status, state)
return self._done_response(status)
def _done_response(self, status):
"""response for when crawl job is done/to be deleted"""
def _done_response(self, status, finalized=False):
"""done response for removing crawl"""
return {
"status": status.dict(exclude_none=True),
"children": [],
"finalized": True,
"finalized": finalized,
}
async def finalize_crawl(self, crawl_id, status, related):
"""ensure crawl id ready for deletion
return with finalized state"""
pvcs = list(related[PVC].keys())
if pvcs:
print("Deleting PVCs", pvcs)
await self.delete_pvc(crawl_id)
finalized = False
else:
finalized = True
return self._done_response(status, finalized)
async def _get_redis(self, redis_url):
"""init redis, ensure connectivity"""
redis = None
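
delete_pvc removes the crawl's volumes explicitly because StatefulSets do not (yet) clean up the PVCs created from their volumeClaimTemplates when they are deleted. Most of the method body is elided in this hunk; the sketch below shows one plausible way to implement it with kubernetes_asyncio's delete-collection call, assuming crawl PVCs carry a crawl=<crawl_id> label and treating the namespace name as a placeholder:

# hypothetical sketch only; the real method body is not shown in this diff
from kubernetes_asyncio import client, config

async def delete_pvc(crawl_id, namespace="crawlers"):
    # delete every PVC labeled for this crawl in a single collection call
    await config.load_kube_config()  # inside a pod, load_incluster_config() instead
    async with client.ApiClient() as api_client:
        core_api = client.CoreV1Api(api_client)
        try:
            await core_api.delete_collection_namespaced_persistent_volume_claim(
                namespace=namespace,
                label_selector=f"crawl={crawl_id}",
            )
        except Exception as exc:
            print("PVC Delete failed", exc, flush=True)

As the comment in the diff notes, the PVCs are deliberately not made children of the CrawlJob, since they are already owned by the StatefulSet.
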
@@ -447,20 +473,29 @@ class BtrixOperator(K8sAPI):
async def add_crawl_errors_to_db(self, redis, crawl_id, inc=100):
"""Pull crawl errors from redis and write to mongo db"""
index = 0
while True:
skip = index * inc
upper_bound = skip + inc - 1
errors = await redis.lrange(f"{crawl_id}:e", skip, upper_bound)
if not errors:
break
try:
# ensure this only runs once
if not await redis.setnx("errors-exported", "1"):
return
await add_crawl_errors(self.crawls, crawl_id, errors)
while True:
skip = index * inc
upper_bound = skip + inc - 1
errors = await redis.lrange(f"{crawl_id}:e", skip, upper_bound)
if not errors:
break
if len(errors) < inc:
# If we have fewer than inc errors, we can assume this is the
# last page of data to add.
break
index += 1
await add_crawl_errors(self.crawls, crawl_id, errors)
if len(errors) < inc:
# If we have fewer than inc errors, we can assume this is the
# last page of data to add.
break
index += 1
# likely redis has already been deleted, so nothing to do
# pylint: disable=bare-except
except:
return
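
Two details of the reworked error export are easy to miss: redis.setnx acts as a run-once guard (only the first invocation exports anything), and the error list is drained in pages of inc entries, where the LRANGE stop index is inclusive. A small self-contained illustration of both, with a plain dict and list standing in for Redis:

# illustration only; a dict/list stand in for Redis, key names follow the diff
def setnx(store, key, value):
    # SETNX semantics: set only if the key is absent; return True if it was set
    if key in store:
        return False
    store[key] = value
    return True

def export_errors(store, errors_list, inc=100):
    exported = []
    if not setnx(store, "errors-exported", "1"):
        return exported  # a previous invocation already exported the errors
    index = 0
    while True:
        skip = index * inc              # LRANGE start: 0, 100, 200, ...
        upper_bound = skip + inc - 1    # LRANGE stop:  99, 199, 299, ... (inclusive)
        errors = errors_list[skip:upper_bound + 1]
        if not errors:
            break
        exported.extend(errors)
        if len(errors) < inc:
            break  # a short page means this was the last one
        index += 1
    return exported

store = {}
print(len(export_errors(store, [f"err-{i}" for i in range(250)])))  # 250
print(len(export_errors(store, [f"err-{i}" for i in range(250)])))  # 0 (guard)

In the real handler the whole block is wrapped in a bare except because Redis may already be gone by the time finalization runs, in which case there is simply nothing left to export.
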
# ============================================================================

View File

@@ -38,14 +38,10 @@ data:
CRAWLER_PV_CLAIM: "{{ .Values.crawler_pv_claim }}"
{{- end }}
CRAWLER_NODE_TYPE: "{{ .Values.crawler_node_type }}"
REDIS_URL: "{{ .Values.redis_url }}"
REDIS_CRAWLS_DONE_KEY: "crawls-done"
NO_DELETE_JOBS: "{{ .Values.no_delete_jobs | default 0 }}"
GRACE_PERIOD_SECS: "{{ .Values.grace_period_secs | default 600 }}"
REGISTRATION_ENABLED: "{{ .Values.registration_enabled | default 0 }}"

View File

@@ -180,11 +180,6 @@ crawler_liveness_port: 6065
grace_period: 1000
# debug
no_delete_jobs: 0
# Local Minio Pod (optional)
# =========================================
# set to true to use a local minio image