operator fixes: (#834)
- just pass cid from operator for consistency, don't load crawl from update_crawl (different object)
- don't throw in update_config_crawl_stats() to avoid exception in operator, only throw in crawlconfigs api
parent f992704491
commit b40d599e17
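
In outline, the change splits responsibilities: the shared `update_config_crawl_stats()` helper now returns the updated config (or `None`) instead of raising, the HTTP 404 moves up into `CrawlConfigOps.update_crawl_stats()`, and the operator passes the `cid` it already reads from `spec["cid"]` into `cancel_crawl()` and `mark_finished()` rather than reloading the crawl object to recover it. A minimal sketch of the resulting pattern, using a dict-backed stand-in for the Mongo collection (the `fake_configs` name and the `crawlCount` update are illustrative, not the real implementation):

# sketch only: dict-backed stand-in for the Mongo collection; the real
# helper calls crawl_configs.find_one_and_update(...) with
# return_document=pymongo.ReturnDocument.AFTER, as in the diff below
from fastapi import HTTPException

fake_configs = {"abc123": {"crawlCount": 0}}  # cid -> config document


async def update_config_crawl_stats(crawl_configs, crawls, cid):
    # shared helper: update and return the config, or None if it is gone;
    # it no longer raises, so the operator can call it unconditionally
    config = crawl_configs.get(cid)
    if config is not None:
        config["crawlCount"] += 1  # illustrative stat update
    return config


async def api_update_crawl_stats(cid):
    # API layer: only here does a missing config become an HTTP 404
    result = await update_config_crawl_stats(fake_configs, None, cid)
    if not result:
        raise HTTPException(
            status_code=404, detail=f"Crawl Config '{cid}' not found to update"
        )
    return result

With this split, a config that has vanished mid-crawl degrades to a no-op stats update on the operator side instead of an unhandled exception in the reconcile loop.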
@@ -622,7 +622,11 @@ class CrawlConfigOps:
 
     async def update_crawl_stats(self, cid: uuid.UUID):
         """Update crawl count, total size, and last crawl information for config."""
-        await update_config_crawl_stats(self.crawl_configs, self.crawls, cid)
+        result = await update_config_crawl_stats(self.crawl_configs, self.crawls, cid)
+        if not result:
+            raise HTTPException(
+                status_code=404, detail=f"Crawl Config '{cid}' not found to update"
+            )
 
     def _add_curr_crawl_stats(self, crawlconfig, crawl):
         """Add stats from current running crawl, if any"""
@@ -921,11 +925,6 @@ async def update_config_crawl_stats(crawl_configs, crawls, cid: uuid.UUID):
         return_document=pymongo.ReturnDocument.AFTER,
     )
 
-    if not result:
-        raise HTTPException(
-            status_code=404, detail=f"Crawl Config '{cid}' not found to update"
-        )
-
     return result
 
 
@@ -139,6 +139,7 @@ class BtrixOperator(K8sAPI):
 
         spec = data.parent.get("spec", {})
         crawl_id = spec["id"]
+        cid = spec["cid"]
 
         scale = spec.get("scale", 1)
         status.scale = scale
@@ -149,21 +150,19 @@ class BtrixOperator(K8sAPI):
         if data.finalizing:
             # if not yet finished, assume it was canceled, mark as such
             if not status.finished:
-                await self.cancel_crawl(redis_url, crawl_id, status, "canceled")
+                await self.cancel_crawl(redis_url, crawl_id, cid, status, "canceled")
 
             return await self.finalize_crawl(crawl_id, status, data.related)
 
         if status.finished:
             return await self.handle_finished_delete_if_needed(crawl_id, status, spec)
 
-        cid = spec["cid"]
-
         try:
             configmap = data.related[CMAP][f"crawl-config-{cid}"]["data"]
         # pylint: disable=bare-except, broad-except
         except:
             # fail crawl if config somehow missing, shouldn't generally happen
-            await self.cancel_crawl(redis_url, crawl_id, status, "failed")
+            await self.cancel_crawl(redis_url, crawl_id, cid, status, "failed")
 
             return self._done_response(status)
 
@@ -278,10 +277,10 @@ class BtrixOperator(K8sAPI):
             print("PVC Delete failed", exc, flush=True)
 
     # pylint: disable=too-many-arguments
-    async def cancel_crawl(self, redis_url, crawl_id, status, state):
+    async def cancel_crawl(self, redis_url, crawl_id, cid, status, state):
         """immediately cancel crawl with specified state"""
         redis = await self._get_redis(redis_url)
-        await self.mark_finished(redis, crawl_id, status, state)
+        await self.mark_finished(redis, crawl_id, cid, status, state)
 
     def _done_response(self, status, finalized=False):
         """done response for removing crawl"""
@@ -421,25 +420,29 @@ class BtrixOperator(K8sAPI):
         # check if one-page crawls actually succeeded
         # if only one page found, and no files, assume failed
         if status.pagesFound == 1 and not status.filesAdded:
-            return await self.mark_finished(redis, crawl.id, status, state="failed")
+            return await self.mark_finished(
+                redis, crawl.id, crawl.cid, status, state="failed"
+            )
 
         completed = status.pagesDone and status.pagesDone >= status.pagesFound
 
         state = "complete" if completed else "partial_complete"
 
         status = await self.mark_finished(
-            redis, crawl.id, status, state, crawl, stats
+            redis, crawl.id, crawl.cid, status, state, crawl, stats
         )
 
         # check if all crawlers failed
         if failed >= crawl.scale:
-            status = await self.mark_finished(redis, crawl.id, status, state="failed")
+            status = await self.mark_finished(
+                redis, crawl.id, crawl.cid, status, state="failed"
+            )
 
         return status
 
     # pylint: disable=too-many-arguments
     async def mark_finished(
-        self, redis, crawl_id, status, state, crawl=None, stats=None
+        self, redis, crawl_id, cid, status, state, crawl=None, stats=None
     ):
         """mark crawl as finished, set finished timestamp and final state"""
         finished = dt_now()
@@ -448,10 +451,9 @@ class BtrixOperator(K8sAPI):
         if stats:
             kwargs["stats"] = stats
 
-        crawl = await update_crawl(self.crawls, crawl_id, **kwargs)
-        crawl_cid = crawl.get("cid")
+        await update_crawl(self.crawls, crawl_id, **kwargs)
 
-        await update_config_crawl_stats(self.crawl_configs, self.crawls, crawl_cid)
+        await update_config_crawl_stats(self.crawl_configs, self.crawls, cid)
 
         if redis:
             await self.add_crawl_errors_to_db(redis, crawl_id)
@@ -67,10 +67,12 @@ def test_get_configs_by_description(
     assert config["description"] == description
 
 
-def test_get_configs_by_schedule_true(crawler_auth_headers, default_org_id, crawler_crawl_id):
+def test_get_configs_by_schedule_true(
+    crawler_auth_headers, default_org_id, crawler_crawl_id
+):
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?schedule=True",
-        headers=crawler_auth_headers
+        headers=crawler_auth_headers,
     )
     data = r.json()
     assert data["total"] == 1
@@ -78,10 +80,12 @@ def test_get_configs_by_schedule_true(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert workflow.get("schedule") not in ("", None)
 
 
-def test_get_configs_by_schedule_false(crawler_auth_headers, default_org_id, crawler_crawl_id):
+def test_get_configs_by_schedule_false(
+    crawler_auth_headers, default_org_id, crawler_crawl_id
+):
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?schedule=False",
-        headers=crawler_auth_headers
+        headers=crawler_auth_headers,
     )
     data = r.json()
     assert data["total"] >= 1