From a1df689729ce59588906f8c4b70ea3ec849a4112 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Mon, 26 Aug 2024 14:18:59 -0700
Subject: [PATCH] stats recompute fixes: (#2022)

- fix stats_recompute_last() and stats_recompute_all() to not update the
  lastCrawl* properties of a crawl workflow if a crawl is running, as those
  stats now point to the running crawl
- refactor _add_running_curr_crawl_stats() to make it clear stats are only
  updated if a crawl is running
- stats_recompute_all(): change sort order to ascending to actually get the
  last crawl, not the first!
---
 backend/btrixcloud/crawlconfigs.py | 80 +++++++++++++-----------------
 1 file changed, 35 insertions(+), 45 deletions(-)

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index d6175c0a..d9846fd2 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -529,7 +529,7 @@ class CrawlConfigOps:
             config = CrawlConfigOut.from_dict(res)
             # pylint: disable=invalid-name
             if not config.inactive:
-                self._add_curr_crawl_stats(config, await self.get_running_crawl(config))
+                await self._add_running_curr_crawl_stats(config)
             configs.append(config)
 
         return configs, total
@@ -554,14 +554,10 @@ class CrawlConfigOps:
 
         return results
 
-    async def get_running_crawl(
-        self, crawlconfig: Union[CrawlConfig, CrawlConfigOut]
-    ) -> Optional[CrawlOut]:
+    async def get_running_crawl(self, cid: UUID) -> Optional[CrawlOut]:
         """Return the id of currently running crawl for this config, if any"""
         # crawls = await self.crawl_manager.list_running_crawls(cid=crawlconfig.id)
-        crawls, _ = await self.crawl_ops.list_crawls(
-            cid=crawlconfig.id, running_only=True
-        )
+        crawls, _ = await self.crawl_ops.list_crawls(cid=cid, running_only=True)
 
         if len(crawls) == 1:
             return crawls[0]
@@ -570,21 +566,22 @@ class CrawlConfigOps:
 
     async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
         """recompute stats by incrementing size counter and number of crawls"""
-        update_query: dict[str, object] = {
-            "lastCrawlId": None,
-            "lastCrawlStartTime": None,
-            "lastStartedBy": None,
-            "lastCrawlTime": None,
-            "lastCrawlState": None,
-            "lastCrawlSize": None,
-            "lastCrawlStopping": False,
-            "isCrawlRunning": False,
-        }
+        update_query: dict[str, object] = {}
 
-        match_query = {"cid": cid, "finished": {"$ne": None}, "inactive": {"$ne": True}}
-        last_crawl = await self.crawls.find_one(
-            match_query, sort=[("finished", pymongo.DESCENDING)]
-        )
+        running_crawl = await self.get_running_crawl(cid)
+        # only look up last finished crawl if no crawls running, otherwise
+        # lastCrawl* stats are already for running crawl
+        if not running_crawl:
+            match_query = {
+                "cid": cid,
+                "finished": {"$ne": None},
+                "inactive": {"$ne": True},
+            }
+            last_crawl = await self.crawls.find_one(
+                match_query, sort=[("finished", pymongo.DESCENDING)]
+            )
+        else:
+            last_crawl = None
 
         if last_crawl:
             last_crawl_finished = last_crawl.get("finished")
@@ -598,6 +595,8 @@ class CrawlConfigOps:
             update_query["lastCrawlSize"] = sum(
                 file_.get("size", 0) for file_ in last_crawl.get("files", [])
             )
+            update_query["lastCrawlStopping"] = False
+            update_query["isCrawlRunning"] = False
 
             if last_crawl_finished:
                 update_query["lastRun"] = last_crawl_finished
@@ -616,16 +615,16 @@ class CrawlConfigOps:
 
         return result is not None
 
-    def _add_curr_crawl_stats(
-        self, crawlconfig: CrawlConfigOut, crawl: Optional[CrawlOut]
-    ):
+    async def _add_running_curr_crawl_stats(self, crawlconfig: CrawlConfigOut):
         """Add stats from current running crawl, if any"""
+        crawl = await self.get_running_crawl(crawlconfig.id)
         if not crawl:
             return
 
         crawlconfig.lastCrawlState = crawl.state
         crawlconfig.lastCrawlSize = crawl.stats.size if crawl.stats else 0
         crawlconfig.lastCrawlStopping = crawl.stopping
+        crawlconfig.isCrawlRunning = True
 
     async def get_crawl_config_out(self, cid: UUID, org: Organization):
         """Return CrawlConfigOut, including state of currently running crawl, if active
@@ -636,9 +635,7 @@ class CrawlConfigOps:
         )
 
         if not crawlconfig.inactive:
-            self._add_curr_crawl_stats(
-                crawlconfig, await self.get_running_crawl(crawlconfig)
-            )
+            await self._add_running_curr_crawl_stats(crawlconfig)
 
         if crawlconfig.profileid:
             crawlconfig.profileName = await self.profiles.get_profile_name(
@@ -715,7 +712,7 @@ class CrawlConfigOps:
 
         query = {"inactive": True}
 
-        is_running = await self.get_running_crawl(crawlconfig) is not None
+        is_running = await self.get_running_crawl(crawlconfig.id) is not None
         if is_running:
             raise HTTPException(status_code=400, detail="crawl_running_cant_deactivate")
 
@@ -829,7 +826,7 @@ class CrawlConfigOps:
         """run new crawl for specified crawlconfig now"""
         self.org_ops.can_write_data(org)
 
-        if await self.get_running_crawl(crawlconfig):
+        if await self.get_running_crawl(crawlconfig.id):
             raise HTTPException(status_code=400, detail="crawl_already_running")
 
         profile_filename = await self.get_profile_filename(crawlconfig.profileid, org)
@@ -924,20 +921,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
     Should only be called when a crawl completes from operator or on migration
     when no crawls are running.
     """
-    update_query: dict[str, object] = {
-        "crawlCount": 0,
-        "crawlSuccessfulCount": 0,
-        "totalSize": 0,
-        "lastCrawlId": None,
-        "lastCrawlStartTime": None,
-        "lastStartedBy": None,
-        "lastStartedByName": None,
-        "lastCrawlTime": None,
-        "lastCrawlState": None,
-        "lastCrawlSize": None,
-        "lastCrawlStopping": False,
-        "isCrawlRunning": False,
-    }
+    update_query: dict[str, object] = {}
 
     match_query = {"cid": cid, "finished": {"$ne": None}}
     count = await crawls.count_documents(match_query)
@@ -950,7 +934,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
     last_crawl: Optional[dict[str, object]] = None
     last_crawl_size = 0
 
-    async for res in crawls.find(match_query).sort("finished", pymongo.DESCENDING):
+    async for res in crawls.find(match_query).sort("finished", pymongo.ASCENDING):
         files = res.get("files", [])
         crawl_size = 0
         for file in files:
@@ -964,7 +948,11 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
         last_crawl = res
         last_crawl_size = crawl_size
 
-    if last_crawl:
+    # only update last_crawl if no crawls running, otherwise
+    # lastCrawl* stats are already for running crawl
+    running_crawl = await crawl_configs.get_running_crawl(cid)
+
+    if last_crawl and not running_crawl:
         update_query["totalSize"] = total_size
         update_query["crawlSuccessfulCount"] = successful_count
 
@@ -974,6 +962,8 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
         update_query["lastStartedByName"] = last_crawl.get("userName")
         update_query["lastCrawlState"] = last_crawl.get("state")
         update_query["lastCrawlSize"] = last_crawl_size
+        update_query["lastCrawlStopping"] = False
+        update_query["isCrawlRunning"] = False
 
         last_crawl_finished = last_crawl.get("finished")
         update_query["lastCrawlTime"] = last_crawl_finished
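
Note (reviewer illustration, not part of the patch): the ascending-sort change works
because stats_recompute_all() overwrites last_crawl on every loop iteration, so
whichever document the cursor yields last wins; with the old descending sort the loop
ended on the oldest crawl. A minimal, self-contained Python sketch of that pattern,
using hypothetical in-memory crawl dicts in place of the MongoDB cursor:

# Hypothetical stand-ins for finished-crawl documents in the crawls collection.
crawls = [
    {"_id": "crawl-a", "finished": 1},
    {"_id": "crawl-c", "finished": 3},
    {"_id": "crawl-b", "finished": 2},
]

# Ascending sort (the fix): the loop's final overwrite is the most recent crawl.
last_crawl = None
for res in sorted(crawls, key=lambda c: c["finished"]):
    last_crawl = res
assert last_crawl["_id"] == "crawl-c"

# Descending sort (the old bug): the loop ends on the *oldest* crawl instead.
last_crawl = None
for res in sorted(crawls, key=lambda c: c["finished"], reverse=True):
    last_crawl = res
assert last_crawl["_id"] == "crawl-a"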