operator fixes: (#834)

- pass cid directly from the operator for consistency, instead of loading the crawl returned by update_crawl (a different object)
- don't raise in update_config_crawl_stats() so the operator doesn't hit an exception; only raise in the crawlconfigs API (see the sketch below)
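
A minimal sketch of the resulting split, not the actual implementation: plain dicts stand in for the Mongo collections, crawlCount is used as a stand-in stats field, and the API-layer wrapper is shown as a free function (update_crawl_stats_api) rather than the real CrawlConfigOps.update_crawl_stats method. The point is only the contract: the shared helper returns the updated config (or None) without raising, and only the API layer turns a missing config into a 404.

    # sketch only: dicts stand in for the Mongo collections used in the real code
    from fastapi import HTTPException


    async def update_config_crawl_stats(crawl_configs, crawls, cid):
        """Shared helper (used by both API and operator): update the config's
        crawl stats and return the updated config, or None if the config does
        not exist. It never raises, so the operator can call it directly."""
        config = crawl_configs.get(cid)
        if not config:
            return None
        config["crawlCount"] = sum(1 for crawl in crawls.values() if crawl["cid"] == cid)
        return config


    async def update_crawl_stats_api(crawl_configs, crawls, cid):
        """API-layer wrapper: only here is a missing config turned into a 404."""
        result = await update_config_crawl_stats(crawl_configs, crawls, cid)
        if not result:
            raise HTTPException(
                status_code=404, detail=f"Crawl Config '{cid}' not found to update"
            )
        return result

With that contract, the operator's mark_finished can call update_config_crawl_stats directly with the cid it already reads from spec["cid"], instead of re-loading the crawl to discover the cid and risking an unhandled HTTPException inside the operator loop.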
Ilya Kreymer 2023-05-06 13:02:33 -07:00 committed by GitHub
parent f992704491
commit b40d599e17
3 changed files with 28 additions and 23 deletions

View File

@@ -622,7 +622,11 @@ class CrawlConfigOps:
     async def update_crawl_stats(self, cid: uuid.UUID):
         """Update crawl count, total size, and last crawl information for config."""
-        await update_config_crawl_stats(self.crawl_configs, self.crawls, cid)
+        result = await update_config_crawl_stats(self.crawl_configs, self.crawls, cid)
+        if not result:
+            raise HTTPException(
+                status_code=404, detail=f"Crawl Config '{cid}' not found to update"
+            )

     def _add_curr_crawl_stats(self, crawlconfig, crawl):
         """Add stats from current running crawl, if any"""

@@ -921,11 +925,6 @@ async def update_config_crawl_stats(crawl_configs, crawls, cid: uuid.UUID):
         return_document=pymongo.ReturnDocument.AFTER,
     )

-    if not result:
-        raise HTTPException(
-            status_code=404, detail=f"Crawl Config '{cid}' not found to update"
-        )
-
     return result

View File

@@ -139,6 +139,7 @@ class BtrixOperator(K8sAPI):
         spec = data.parent.get("spec", {})
         crawl_id = spec["id"]
+        cid = spec["cid"]

         scale = spec.get("scale", 1)
         status.scale = scale

@@ -149,21 +150,19 @@ class BtrixOperator(K8sAPI):
         if data.finalizing:
             # if not yet finished, assume it was canceled, mark as such
             if not status.finished:
-                await self.cancel_crawl(redis_url, crawl_id, status, "canceled")
+                await self.cancel_crawl(redis_url, crawl_id, cid, status, "canceled")

             return await self.finalize_crawl(crawl_id, status, data.related)

         if status.finished:
             return await self.handle_finished_delete_if_needed(crawl_id, status, spec)

-        cid = spec["cid"]
-
         try:
             configmap = data.related[CMAP][f"crawl-config-{cid}"]["data"]
         # pylint: disable=bare-except, broad-except
         except:
             # fail crawl if config somehow missing, shouldn't generally happen
-            await self.cancel_crawl(redis_url, crawl_id, status, "failed")
+            await self.cancel_crawl(redis_url, crawl_id, cid, status, "failed")
             return self._done_response(status)

@@ -278,10 +277,10 @@ class BtrixOperator(K8sAPI):
             print("PVC Delete failed", exc, flush=True)

     # pylint: disable=too-many-arguments
-    async def cancel_crawl(self, redis_url, crawl_id, status, state):
+    async def cancel_crawl(self, redis_url, crawl_id, cid, status, state):
         """immediately cancel crawl with specified state"""
         redis = await self._get_redis(redis_url)
-        await self.mark_finished(redis, crawl_id, status, state)
+        await self.mark_finished(redis, crawl_id, cid, status, state)

     def _done_response(self, status, finalized=False):
         """done response for removing crawl"""

@@ -421,25 +420,29 @@ class BtrixOperator(K8sAPI):
         # check if one-page crawls actually succeeded
         # if only one page found, and no files, assume failed
         if status.pagesFound == 1 and not status.filesAdded:
-            return await self.mark_finished(redis, crawl.id, status, state="failed")
+            return await self.mark_finished(
+                redis, crawl.id, crawl.cid, status, state="failed"
+            )

         completed = status.pagesDone and status.pagesDone >= status.pagesFound

         state = "complete" if completed else "partial_complete"
         status = await self.mark_finished(
-            redis, crawl.id, status, state, crawl, stats
+            redis, crawl.id, crawl.cid, status, state, crawl, stats
         )

         # check if all crawlers failed
         if failed >= crawl.scale:
-            status = await self.mark_finished(redis, crawl.id, status, state="failed")
+            status = await self.mark_finished(
+                redis, crawl.id, crawl.cid, status, state="failed"
+            )

         return status

     # pylint: disable=too-many-arguments
     async def mark_finished(
-        self, redis, crawl_id, status, state, crawl=None, stats=None
+        self, redis, crawl_id, cid, status, state, crawl=None, stats=None
     ):
         """mark crawl as finished, set finished timestamp and final state"""
         finished = dt_now()

@@ -448,10 +451,9 @@ class BtrixOperator(K8sAPI):
         if stats:
             kwargs["stats"] = stats

-        crawl = await update_crawl(self.crawls, crawl_id, **kwargs)
+        await update_crawl(self.crawls, crawl_id, **kwargs)

-        crawl_cid = crawl.get("cid")
-
-        await update_config_crawl_stats(self.crawl_configs, self.crawls, crawl_cid)
+        await update_config_crawl_stats(self.crawl_configs, self.crawls, cid)

         if redis:
             await self.add_crawl_errors_to_db(redis, crawl_id)

View File

@@ -67,10 +67,12 @@ def test_get_configs_by_description(
         assert config["description"] == description


-def test_get_configs_by_schedule_true(crawler_auth_headers, default_org_id, crawler_crawl_id):
+def test_get_configs_by_schedule_true(
+    crawler_auth_headers, default_org_id, crawler_crawl_id
+):
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?schedule=True",
-        headers=crawler_auth_headers
+        headers=crawler_auth_headers,
     )
     data = r.json()
     assert data["total"] == 1

@@ -78,10 +80,12 @@ def test_get_configs_by_schedule_true(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert workflow.get("schedule") not in ("", None)


-def test_get_configs_by_schedule_false(crawler_auth_headers, default_org_id, crawler_crawl_id):
+def test_get_configs_by_schedule_false(
+    crawler_auth_headers, default_org_id, crawler_crawl_id
+):
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?schedule=False",
-        headers=crawler_auth_headers
+        headers=crawler_auth_headers,
     )
     data = r.json()
     assert data["total"] >= 1