stats recompute fixes (#2022)

- fix stats_recompute_last() and stats_recompute_all() to not update the lastCrawl* properties of a crawl workflow if a crawl is running, as those stats now point to the running crawl
- refactor _add_running_curr_crawl_stats() to make it clear stats are only updated if a crawl is running
- stats_recompute_all(): change sort order to ascending to actually get the last crawl, not the first!
parent 135c97419d
commit a1df689729
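The common thread in the first two fixes is a guard: while a crawl is running, a workflow's lastCrawl* fields already mirror that crawl, so a stats recompute must not clobber them with values from the last finished crawl. A minimal standalone sketch of that pattern (plain dicts stand in for the async Mongo documents; build_last_crawl_update is a hypothetical helper written for illustration, not part of the codebase):

from typing import Optional


def build_last_crawl_update(
    running_crawl: Optional[dict], last_finished_crawl: Optional[dict]
) -> dict:
    """Return only the lastCrawl* fields that are safe to write back."""
    update_query: dict[str, object] = {}

    # A crawl is running: lastCrawl* already reflects it, so write nothing.
    if running_crawl:
        return update_query

    if last_finished_crawl:
        update_query["lastCrawlId"] = str(last_finished_crawl.get("_id"))
        update_query["lastCrawlState"] = last_finished_crawl.get("state")
        update_query["lastCrawlStopping"] = False
        update_query["isCrawlRunning"] = False

    return update_query


assert build_last_crawl_update({"state": "running"}, {"_id": "c1"}) == {}

Starting from an empty update_query, rather than a dict of None defaults as both functions did before this commit, is what makes the skip possible: a key that is never added is simply left untouched in the database, whereas the old code unconditionally reset every field.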
@@ -529,7 +529,7 @@ class CrawlConfigOps:
             config = CrawlConfigOut.from_dict(res)
             # pylint: disable=invalid-name
             if not config.inactive:
-                self._add_curr_crawl_stats(config, await self.get_running_crawl(config))
+                await self._add_running_curr_crawl_stats(config)
             configs.append(config)
 
         return configs, total
@@ -554,14 +554,10 @@ class CrawlConfigOps:
 
         return results
 
-    async def get_running_crawl(
-        self, crawlconfig: Union[CrawlConfig, CrawlConfigOut]
-    ) -> Optional[CrawlOut]:
+    async def get_running_crawl(self, cid: UUID) -> Optional[CrawlOut]:
         """Return the id of currently running crawl for this config, if any"""
         # crawls = await self.crawl_manager.list_running_crawls(cid=crawlconfig.id)
-        crawls, _ = await self.crawl_ops.list_crawls(
-            cid=crawlconfig.id, running_only=True
-        )
+        crawls, _ = await self.crawl_ops.list_crawls(cid=cid, running_only=True)
 
         if len(crawls) == 1:
             return crawls[0]
@@ -570,21 +566,22 @@ class CrawlConfigOps:
 
     async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
         """recompute stats by incrementing size counter and number of crawls"""
-        update_query: dict[str, object] = {
-            "lastCrawlId": None,
-            "lastCrawlStartTime": None,
-            "lastStartedBy": None,
-            "lastCrawlTime": None,
-            "lastCrawlState": None,
-            "lastCrawlSize": None,
-            "lastCrawlStopping": False,
-            "isCrawlRunning": False,
-        }
+        update_query: dict[str, object] = {}
 
-        match_query = {"cid": cid, "finished": {"$ne": None}, "inactive": {"$ne": True}}
-        last_crawl = await self.crawls.find_one(
-            match_query, sort=[("finished", pymongo.DESCENDING)]
-        )
+        running_crawl = await self.get_running_crawl(cid)
+        # only look up last finished crawl if no crawls running, otherwise
+        # lastCrawl* stats are already for running crawl
+        if not running_crawl:
+            match_query = {
+                "cid": cid,
+                "finished": {"$ne": None},
+                "inactive": {"$ne": True},
+            }
+            last_crawl = await self.crawls.find_one(
+                match_query, sort=[("finished", pymongo.DESCENDING)]
+            )
+        else:
+            last_crawl = None
 
         if last_crawl:
             last_crawl_finished = last_crawl.get("finished")
@@ -598,6 +595,8 @@ class CrawlConfigOps:
             update_query["lastCrawlSize"] = sum(
                 file_.get("size", 0) for file_ in last_crawl.get("files", [])
             )
+            update_query["lastCrawlStopping"] = False
+            update_query["isCrawlRunning"] = False
 
             if last_crawl_finished:
                 update_query["lastRun"] = last_crawl_finished
@@ -616,16 +615,16 @@ class CrawlConfigOps:
 
         return result is not None
 
-    def _add_curr_crawl_stats(
-        self, crawlconfig: CrawlConfigOut, crawl: Optional[CrawlOut]
-    ):
+    async def _add_running_curr_crawl_stats(self, crawlconfig: CrawlConfigOut):
         """Add stats from current running crawl, if any"""
+        crawl = await self.get_running_crawl(crawlconfig.id)
         if not crawl:
             return
 
         crawlconfig.lastCrawlState = crawl.state
         crawlconfig.lastCrawlSize = crawl.stats.size if crawl.stats else 0
         crawlconfig.lastCrawlStopping = crawl.stopping
         crawlconfig.isCrawlRunning = True
 
     async def get_crawl_config_out(self, cid: UUID, org: Organization):
         """Return CrawlConfigOut, including state of currently running crawl, if active
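With the rename, the helper does its own lookup, so callers can no longer hand it an arbitrary crawl: stats are only ever copied from a crawl that is actually running. A hedged sketch of the resulting contract (FakeCrawl and FakeConfigOut are simplified stand-ins for the real CrawlOut and CrawlConfigOut models, reduced to the fields touched here):

from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeCrawl:
    state: str = "running"
    size: int = 1024
    stopping: bool = False


@dataclass
class FakeConfigOut:
    lastCrawlState: Optional[str] = None
    lastCrawlSize: int = 0
    lastCrawlStopping: bool = False
    isCrawlRunning: bool = False


def add_running_crawl_stats(config: FakeConfigOut, crawl: Optional[FakeCrawl]) -> None:
    # Mirrors _add_running_curr_crawl_stats: a no-op unless a crawl is running.
    if not crawl:
        return
    config.lastCrawlState = crawl.state
    config.lastCrawlSize = crawl.size
    config.lastCrawlStopping = crawl.stopping
    config.isCrawlRunning = True


config = FakeConfigOut()
add_running_crawl_stats(config, None)
assert not config.isCrawlRunning  # nothing overwritten when idle
add_running_crawl_stats(config, FakeCrawl())
assert config.isCrawlRunning and config.lastCrawlSize == 1024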
@@ -636,9 +635,7 @@ class CrawlConfigOps:
         )
 
         if not crawlconfig.inactive:
-            self._add_curr_crawl_stats(
-                crawlconfig, await self.get_running_crawl(crawlconfig)
-            )
+            await self._add_running_curr_crawl_stats(crawlconfig)
 
         if crawlconfig.profileid:
             crawlconfig.profileName = await self.profiles.get_profile_name(
@@ -715,7 +712,7 @@ class CrawlConfigOps:
 
         query = {"inactive": True}
 
-        is_running = await self.get_running_crawl(crawlconfig) is not None
+        is_running = await self.get_running_crawl(crawlconfig.id) is not None
 
         if is_running:
             raise HTTPException(status_code=400, detail="crawl_running_cant_deactivate")
@@ -829,7 +826,7 @@ class CrawlConfigOps:
         """run new crawl for specified crawlconfig now"""
         self.org_ops.can_write_data(org)
 
-        if await self.get_running_crawl(crawlconfig):
+        if await self.get_running_crawl(crawlconfig.id):
             raise HTTPException(status_code=400, detail="crawl_already_running")
 
         profile_filename = await self.get_profile_filename(crawlconfig.profileid, org)
@@ -924,20 +921,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
     Should only be called when a crawl completes from operator or on migration
     when no crawls are running.
     """
-    update_query: dict[str, object] = {
-        "crawlCount": 0,
-        "crawlSuccessfulCount": 0,
-        "totalSize": 0,
-        "lastCrawlId": None,
-        "lastCrawlStartTime": None,
-        "lastStartedBy": None,
-        "lastStartedByName": None,
-        "lastCrawlTime": None,
-        "lastCrawlState": None,
-        "lastCrawlSize": None,
-        "lastCrawlStopping": False,
-        "isCrawlRunning": False,
-    }
+    update_query: dict[str, object] = {}
 
     match_query = {"cid": cid, "finished": {"$ne": None}}
     count = await crawls.count_documents(match_query)
@@ -950,7 +934,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
     last_crawl: Optional[dict[str, object]] = None
     last_crawl_size = 0
 
-    async for res in crawls.find(match_query).sort("finished", pymongo.DESCENDING):
+    async for res in crawls.find(match_query).sort("finished", pymongo.ASCENDING):
         files = res.get("files", [])
         crawl_size = 0
         for file in files:
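The sort-order fix is easiest to see in isolation: the loop body assigns last_crawl = res on every iteration, so whichever document the cursor yields last wins. Under DESCENDING the final assignment was the oldest finished crawl; ASCENDING makes it the most recent one. A toy reproduction (plain lists and a synchronous loop standing in for the async Mongo cursor):

# Toy reproduction of the sort-order bug: the loop keeps the *last* item seen.
crawls = [
    {"id": "a", "finished": 1},
    {"id": "b", "finished": 2},
    {"id": "c", "finished": 3},
]


def pick_last_crawl(docs):
    last_crawl = None
    for res in docs:  # mirrors: async for res in crawls.find(...).sort(...)
        last_crawl = res
    return last_crawl


# DESCENDING (old, buggy): iteration ends on the oldest crawl.
assert pick_last_crawl(sorted(crawls, key=lambda d: d["finished"], reverse=True))["id"] == "a"
# ASCENDING (fixed): iteration ends on the most recent crawl.
assert pick_last_crawl(sorted(crawls, key=lambda d: d["finished"]))["id"] == "c"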
@@ -964,7 +948,11 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
             last_crawl = res
             last_crawl_size = crawl_size
 
-    if last_crawl:
+    # only update last_crawl if no crawls running, otherwise
+    # lastCrawl* stats are already for running crawl
+    running_crawl = await crawl_configs.get_running_crawl(cid)
+
+    if last_crawl and not running_crawl:
         update_query["totalSize"] = total_size
         update_query["crawlSuccessfulCount"] = successful_count
 
@@ -974,6 +962,8 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
         update_query["lastStartedByName"] = last_crawl.get("userName")
         update_query["lastCrawlState"] = last_crawl.get("state")
         update_query["lastCrawlSize"] = last_crawl_size
+        update_query["lastCrawlStopping"] = False
+        update_query["isCrawlRunning"] = False
 
         last_crawl_finished = last_crawl.get("finished")
         update_query["lastCrawlTime"] = last_crawl_finished