From a1df689729ce59588906f8c4b70ea3ec849a4112 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Mon, 26 Aug 2024 14:18:59 -0700
Subject: [PATCH] stats recompute fixes: (#2022)

- fix stats_recompute_last() and stats_recompute_all() to not update the
  lastCrawl* properties of a crawl workflow if a crawl is running, as those
  stats now point to the running crawl
- refactor _add_running_curr_crawl_stats() to make it clear stats are only
  updated if a crawl is running
- stats_recompute_all(): change sort order to ascending to actually get the
  last crawl, not the first!
---
 backend/btrixcloud/crawlconfigs.py | 80 +++++++++++++-----------------
 1 file changed, 35 insertions(+), 45 deletions(-)

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index d6175c0a..d9846fd2 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -529,7 +529,7 @@ class CrawlConfigOps:
             config = CrawlConfigOut.from_dict(res)
             # pylint: disable=invalid-name
             if not config.inactive:
-                self._add_curr_crawl_stats(config, await self.get_running_crawl(config))
+                await self._add_running_curr_crawl_stats(config)
             configs.append(config)
 
         return configs, total
@@ -554,14 +554,10 @@ class CrawlConfigOps:
 
         return results
 
-    async def get_running_crawl(
-        self, crawlconfig: Union[CrawlConfig, CrawlConfigOut]
-    ) -> Optional[CrawlOut]:
+    async def get_running_crawl(self, cid: UUID) -> Optional[CrawlOut]:
         """Return the id of currently running crawl for this config, if any"""
         # crawls = await self.crawl_manager.list_running_crawls(cid=crawlconfig.id)
-        crawls, _ = await self.crawl_ops.list_crawls(
-            cid=crawlconfig.id, running_only=True
-        )
+        crawls, _ = await self.crawl_ops.list_crawls(cid=cid, running_only=True)
 
         if len(crawls) == 1:
             return crawls[0]
@@ -570,21 +566,22 @@ class CrawlConfigOps:
 
     async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
         """recompute stats by incrementing size counter and number of crawls"""
-        update_query: dict[str, object] = {
-            "lastCrawlId": None,
-            "lastCrawlStartTime": None,
-            "lastStartedBy": None,
-            "lastCrawlTime": None,
-            "lastCrawlState": None,
-            "lastCrawlSize": None,
-            "lastCrawlStopping": False,
-            "isCrawlRunning": False,
-        }
+        update_query: dict[str, object] = {}
 
-        match_query = {"cid": cid, "finished": {"$ne": None}, "inactive": {"$ne": True}}
-        last_crawl = await self.crawls.find_one(
-            match_query, sort=[("finished", pymongo.DESCENDING)]
-        )
+        running_crawl = await self.get_running_crawl(cid)
+        # only look up last finished crawl if no crawls running, otherwise
+        # lastCrawl* stats are already for running crawl
+        if not running_crawl:
+            match_query = {
+                "cid": cid,
+                "finished": {"$ne": None},
+                "inactive": {"$ne": True},
+            }
+            last_crawl = await self.crawls.find_one(
+                match_query, sort=[("finished", pymongo.DESCENDING)]
+            )
+        else:
+            last_crawl = None
 
         if last_crawl:
             last_crawl_finished = last_crawl.get("finished")
@@ -598,6 +595,8 @@ class CrawlConfigOps:
             update_query["lastCrawlSize"] = sum(
                 file_.get("size", 0) for file_ in last_crawl.get("files", [])
             )
+            update_query["lastCrawlStopping"] = False
+            update_query["isCrawlRunning"] = False
 
             if last_crawl_finished:
                 update_query["lastRun"] = last_crawl_finished
@@ -616,16 +615,16 @@ class CrawlConfigOps:
 
         return result is not None
 
-    def _add_curr_crawl_stats(
-        self, crawlconfig: CrawlConfigOut, crawl: Optional[CrawlOut]
-    ):
+    async def _add_running_curr_crawl_stats(self, crawlconfig: CrawlConfigOut):
         """Add stats from current running crawl, if any"""
+        crawl = await self.get_running_crawl(crawlconfig.id)
         if not crawl:
             return
 
         crawlconfig.lastCrawlState = crawl.state
         crawlconfig.lastCrawlSize = crawl.stats.size if crawl.stats else 0
         crawlconfig.lastCrawlStopping = crawl.stopping
+        crawlconfig.isCrawlRunning = True
 
     async def get_crawl_config_out(self, cid: UUID, org: Organization):
         """Return CrawlConfigOut, including state of currently running crawl, if active
@@ -636,9 +635,7 @@ class CrawlConfigOps:
         )
 
         if not crawlconfig.inactive:
-            self._add_curr_crawl_stats(
-                crawlconfig, await self.get_running_crawl(crawlconfig)
-            )
+            await self._add_running_curr_crawl_stats(crawlconfig)
 
         if crawlconfig.profileid:
             crawlconfig.profileName = await self.profiles.get_profile_name(
@@ -715,7 +712,7 @@ class CrawlConfigOps:
 
         query = {"inactive": True}
 
-        is_running = await self.get_running_crawl(crawlconfig) is not None
+        is_running = await self.get_running_crawl(crawlconfig.id) is not None
         if is_running:
             raise HTTPException(status_code=400, detail="crawl_running_cant_deactivate")
 
@@ -829,7 +826,7 @@ class CrawlConfigOps:
         """run new crawl for specified crawlconfig now"""
         self.org_ops.can_write_data(org)
 
-        if await self.get_running_crawl(crawlconfig):
+        if await self.get_running_crawl(crawlconfig.id):
             raise HTTPException(status_code=400, detail="crawl_already_running")
 
         profile_filename = await self.get_profile_filename(crawlconfig.profileid, org)
@@ -924,20 +921,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
     Should only be called when a crawl completes from operator or on migration
     when no crawls are running.
     """
-    update_query: dict[str, object] = {
-        "crawlCount": 0,
-        "crawlSuccessfulCount": 0,
-        "totalSize": 0,
-        "lastCrawlId": None,
-        "lastCrawlStartTime": None,
-        "lastStartedBy": None,
-        "lastStartedByName": None,
-        "lastCrawlTime": None,
-        "lastCrawlState": None,
-        "lastCrawlSize": None,
-        "lastCrawlStopping": False,
-        "isCrawlRunning": False,
-    }
+    update_query: dict[str, object] = {}
 
     match_query = {"cid": cid, "finished": {"$ne": None}}
     count = await crawls.count_documents(match_query)
@@ -950,7 +934,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
     last_crawl: Optional[dict[str, object]] = None
     last_crawl_size = 0
 
-    async for res in crawls.find(match_query).sort("finished", pymongo.DESCENDING):
+    async for res in crawls.find(match_query).sort("finished", pymongo.ASCENDING):
         files = res.get("files", [])
         crawl_size = 0
         for file in files:
@@ -964,7 +948,11 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
         last_crawl = res
         last_crawl_size = crawl_size
 
-    if last_crawl:
+    # only update last_crawl if no crawls running, otherwise
+    # lastCrawl* stats are already for running crawl
+    running_crawl = await crawl_configs.get_running_crawl(cid)
+
+    if last_crawl and not running_crawl:
         update_query["totalSize"] = total_size
         update_query["crawlSuccessfulCount"] = successful_count
 
@@ -974,6 +962,8 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
         update_query["lastStartedByName"] = last_crawl.get("userName")
         update_query["lastCrawlState"] = last_crawl.get("state")
         update_query["lastCrawlSize"] = last_crawl_size
+        update_query["lastCrawlStopping"] = False
+        update_query["isCrawlRunning"] = False
 
         last_crawl_finished = last_crawl.get("finished")
         update_query["lastCrawlTime"] = last_crawl_finished
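
Note (reviewer illustration, not part of the patch): the ascending-sort change works
because stats_recompute_all() overwrites last_crawl on every loop iteration, so
whichever document the cursor yields last wins; with the old descending sort the loop
ended on the oldest crawl. A minimal, self-contained Python sketch of that pattern,
using hypothetical in-memory crawl dicts in place of the MongoDB cursor:

# Hypothetical stand-ins for finished-crawl documents in the crawls collection.
crawls = [
    {"_id": "crawl-a", "finished": 1},
    {"_id": "crawl-c", "finished": 3},
    {"_id": "crawl-b", "finished": 2},
]

# Ascending sort (the fix): the loop's final overwrite is the most recent crawl.
last_crawl = None
for res in sorted(crawls, key=lambda c: c["finished"]):
    last_crawl = res
assert last_crawl["_id"] == "crawl-c"

# Descending sort (the old bug): the loop ends on the *oldest* crawl instead.
last_crawl = None
for res in sorted(crawls, key=lambda c: c["finished"], reverse=True):
    last_crawl = res
assert last_crawl["_id"] == "crawl-a"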