operator fixes: (#834)

- pass the cid from the operator directly for consistency, instead of loading the crawl returned by update_crawl() (a different object)
- don't raise in update_config_crawl_stats(), to avoid an exception inside the operator; raise only in the crawlconfigs API (see the sketch below)
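
A minimal sketch of both points, assuming simplified stand-in signatures and a placeholder $inc update (this is not the actual btrixcloud code): the shared helper returns the updated config document or None and never raises, only the API-layer method converts a missing config into an HTTP 404, and the operator passes the cid it already read from spec["cid"] instead of re-loading the crawl to find it.

    # Illustrative sketch only -- simplified stand-ins, not the real code
    import uuid

    from fastapi import HTTPException


    async def update_config_crawl_stats(crawl_configs, crawls, cid: uuid.UUID):
        """Shared helper: update stats, return updated doc or None. Never raises."""
        result = await crawl_configs.find_one_and_update(
            {"_id": cid},
            {"$inc": {"crawlCount": 1}},  # placeholder update, not the real stats logic
        )
        return result


    class CrawlConfigOps:
        """API layer: the only place a missing config becomes an HTTPException."""

        def __init__(self, crawl_configs, crawls):
            self.crawl_configs = crawl_configs
            self.crawls = crawls

        async def update_crawl_stats(self, cid: uuid.UUID):
            result = await update_config_crawl_stats(
                self.crawl_configs, self.crawls, cid
            )
            if not result:
                raise HTTPException(
                    status_code=404, detail=f"Crawl Config '{cid}' not found to update"
                )


    class OperatorSketch:
        """Operator side: uses the cid taken from the spec, never raises on a miss."""

        def __init__(self, crawls, crawl_configs):
            self.crawls = crawls
            self.crawl_configs = crawl_configs

        async def mark_finished(self, crawl_id, cid, **kwargs):
            # cid comes straight from the operator's spec; no extra crawl lookup needed
            await self.crawls.find_one_and_update({"_id": crawl_id}, {"$set": kwargs})
            await update_config_crawl_stats(self.crawl_configs, self.crawls, cid)

Keeping HTTP concerns out of the shared helper means the operator's reconcile path can call it without a try/except, while the API still reports a missing config as a 404.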
Ilya Kreymer 2023-05-06 13:02:33 -07:00 committed by GitHub
parent f992704491
commit b40d599e17
3 changed files with 28 additions and 23 deletions


@@ -622,7 +622,11 @@ class CrawlConfigOps:
     async def update_crawl_stats(self, cid: uuid.UUID):
         """Update crawl count, total size, and last crawl information for config."""
-        await update_config_crawl_stats(self.crawl_configs, self.crawls, cid)
+        result = await update_config_crawl_stats(self.crawl_configs, self.crawls, cid)
+        if not result:
+            raise HTTPException(
+                status_code=404, detail=f"Crawl Config '{cid}' not found to update"
+            )

     def _add_curr_crawl_stats(self, crawlconfig, crawl):
         """Add stats from current running crawl, if any"""
@@ -921,11 +925,6 @@ async def update_config_crawl_stats(crawl_configs, crawls, cid: uuid.UUID):
         return_document=pymongo.ReturnDocument.AFTER,
     )

-    if not result:
-        raise HTTPException(
-            status_code=404, detail=f"Crawl Config '{cid}' not found to update"
-        )
-
     return result


@@ -139,6 +139,7 @@ class BtrixOperator(K8sAPI):
         spec = data.parent.get("spec", {})
         crawl_id = spec["id"]
+        cid = spec["cid"]

         scale = spec.get("scale", 1)
         status.scale = scale
@@ -149,21 +150,19 @@ class BtrixOperator(K8sAPI):
         if data.finalizing:
             # if not yet finished, assume it was canceled, mark as such
             if not status.finished:
-                await self.cancel_crawl(redis_url, crawl_id, status, "canceled")
+                await self.cancel_crawl(redis_url, crawl_id, cid, status, "canceled")

             return await self.finalize_crawl(crawl_id, status, data.related)

         if status.finished:
             return await self.handle_finished_delete_if_needed(crawl_id, status, spec)

-        cid = spec["cid"]
-
         try:
             configmap = data.related[CMAP][f"crawl-config-{cid}"]["data"]
         # pylint: disable=bare-except, broad-except
         except:
             # fail crawl if config somehow missing, shouldn't generally happen
-            await self.cancel_crawl(redis_url, crawl_id, status, "failed")
+            await self.cancel_crawl(redis_url, crawl_id, cid, status, "failed")
             return self._done_response(status)
@@ -278,10 +277,10 @@ class BtrixOperator(K8sAPI):
             print("PVC Delete failed", exc, flush=True)

     # pylint: disable=too-many-arguments
-    async def cancel_crawl(self, redis_url, crawl_id, status, state):
+    async def cancel_crawl(self, redis_url, crawl_id, cid, status, state):
         """immediately cancel crawl with specified state"""
         redis = await self._get_redis(redis_url)
-        await self.mark_finished(redis, crawl_id, status, state)
+        await self.mark_finished(redis, crawl_id, cid, status, state)

     def _done_response(self, status, finalized=False):
         """done response for removing crawl"""
@@ -421,25 +420,29 @@ class BtrixOperator(K8sAPI):
         # check if one-page crawls actually succeeded
         # if only one page found, and no files, assume failed
         if status.pagesFound == 1 and not status.filesAdded:
-            return await self.mark_finished(redis, crawl.id, status, state="failed")
+            return await self.mark_finished(
+                redis, crawl.id, crawl.cid, status, state="failed"
+            )

         completed = status.pagesDone and status.pagesDone >= status.pagesFound

         state = "complete" if completed else "partial_complete"

         status = await self.mark_finished(
-            redis, crawl.id, status, state, crawl, stats
+            redis, crawl.id, crawl.cid, status, state, crawl, stats
         )

         # check if all crawlers failed
         if failed >= crawl.scale:
-            status = await self.mark_finished(redis, crawl.id, status, state="failed")
+            status = await self.mark_finished(
+                redis, crawl.id, crawl.cid, status, state="failed"
+            )

         return status

     # pylint: disable=too-many-arguments
     async def mark_finished(
-        self, redis, crawl_id, status, state, crawl=None, stats=None
+        self, redis, crawl_id, cid, status, state, crawl=None, stats=None
     ):
         """mark crawl as finished, set finished timestamp and final state"""
         finished = dt_now()
@@ -448,10 +451,9 @@ class BtrixOperator(K8sAPI):
         if stats:
             kwargs["stats"] = stats

-        crawl = await update_crawl(self.crawls, crawl_id, **kwargs)
-        crawl_cid = crawl.get("cid")
+        await update_crawl(self.crawls, crawl_id, **kwargs)

-        await update_config_crawl_stats(self.crawl_configs, self.crawls, crawl_cid)
+        await update_config_crawl_stats(self.crawl_configs, self.crawls, cid)

         if redis:
             await self.add_crawl_errors_to_db(redis, crawl_id)


@@ -67,10 +67,12 @@ def test_get_configs_by_description(
     assert config["description"] == description


-def test_get_configs_by_schedule_true(crawler_auth_headers, default_org_id, crawler_crawl_id):
+def test_get_configs_by_schedule_true(
+    crawler_auth_headers, default_org_id, crawler_crawl_id
+):
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?schedule=True",
-        headers=crawler_auth_headers
+        headers=crawler_auth_headers,
     )
     data = r.json()
     assert data["total"] == 1
@@ -78,10 +80,12 @@ def test_get_configs_by_schedule_true(crawler_auth_headers, default_org_id, craw
     assert workflow.get("schedule") not in ("", None)


-def test_get_configs_by_schedule_false(crawler_auth_headers, default_org_id, crawler_crawl_id):
+def test_get_configs_by_schedule_false(
+    crawler_auth_headers, default_org_id, crawler_crawl_id
+):
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?schedule=False",
-        headers=crawler_auth_headers
+        headers=crawler_auth_headers,
     )
     data = r.json()
     assert data["total"] >= 1