stats recompute fixes (#2022)
- fix stats_recompute_last() and stats_recompute_all() so they do not update the lastCrawl* properties of a crawl workflow while a crawl is running, since those stats already point to the running crawl
- refactor _add_running_curr_crawl_stats() to make it clear that stats are only updated when a crawl is running
- stats_recompute_all(): change the sort order to ascending so the loop ends on the most recent crawl, actually getting the last crawl rather than the first
This commit is contained in:
parent
135c97419d
commit
a1df689729
@ -529,7 +529,7 @@ class CrawlConfigOps:
|
|||||||
config = CrawlConfigOut.from_dict(res)
|
config = CrawlConfigOut.from_dict(res)
|
||||||
# pylint: disable=invalid-name
|
# pylint: disable=invalid-name
|
||||||
if not config.inactive:
|
if not config.inactive:
|
||||||
self._add_curr_crawl_stats(config, await self.get_running_crawl(config))
|
await self._add_running_curr_crawl_stats(config)
|
||||||
configs.append(config)
|
configs.append(config)
|
||||||
|
|
||||||
return configs, total
|
return configs, total
|
||||||
@ -554,14 +554,10 @@ class CrawlConfigOps:
|
|||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
async def get_running_crawl(
|
async def get_running_crawl(self, cid: UUID) -> Optional[CrawlOut]:
|
||||||
self, crawlconfig: Union[CrawlConfig, CrawlConfigOut]
|
|
||||||
) -> Optional[CrawlOut]:
|
|
||||||
"""Return the id of currently running crawl for this config, if any"""
|
"""Return the id of currently running crawl for this config, if any"""
|
||||||
# crawls = await self.crawl_manager.list_running_crawls(cid=crawlconfig.id)
|
# crawls = await self.crawl_manager.list_running_crawls(cid=crawlconfig.id)
|
||||||
crawls, _ = await self.crawl_ops.list_crawls(
|
crawls, _ = await self.crawl_ops.list_crawls(cid=cid, running_only=True)
|
||||||
cid=crawlconfig.id, running_only=True
|
|
||||||
)
|
|
||||||
|
|
||||||
if len(crawls) == 1:
|
if len(crawls) == 1:
|
||||||
return crawls[0]
|
return crawls[0]
|
||||||
@ -570,21 +566,22 @@ class CrawlConfigOps:
|
|||||||
|
|
||||||
async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
|
async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
|
||||||
"""recompute stats by incrementing size counter and number of crawls"""
|
"""recompute stats by incrementing size counter and number of crawls"""
|
||||||
update_query: dict[str, object] = {
|
update_query: dict[str, object] = {}
|
||||||
"lastCrawlId": None,
|
|
||||||
"lastCrawlStartTime": None,
|
|
||||||
"lastStartedBy": None,
|
|
||||||
"lastCrawlTime": None,
|
|
||||||
"lastCrawlState": None,
|
|
||||||
"lastCrawlSize": None,
|
|
||||||
"lastCrawlStopping": False,
|
|
||||||
"isCrawlRunning": False,
|
|
||||||
}
|
|
||||||
|
|
||||||
match_query = {"cid": cid, "finished": {"$ne": None}, "inactive": {"$ne": True}}
|
running_crawl = await self.get_running_crawl(cid)
|
||||||
last_crawl = await self.crawls.find_one(
|
# only look up last finished crawl if no crawls running, otherwise
|
||||||
match_query, sort=[("finished", pymongo.DESCENDING)]
|
# lastCrawl* stats are already for running crawl
|
||||||
)
|
if not running_crawl:
|
||||||
|
match_query = {
|
||||||
|
"cid": cid,
|
||||||
|
"finished": {"$ne": None},
|
||||||
|
"inactive": {"$ne": True},
|
||||||
|
}
|
||||||
|
last_crawl = await self.crawls.find_one(
|
||||||
|
match_query, sort=[("finished", pymongo.DESCENDING)]
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
last_crawl = None
|
||||||
|
|
||||||
if last_crawl:
|
if last_crawl:
|
||||||
last_crawl_finished = last_crawl.get("finished")
|
last_crawl_finished = last_crawl.get("finished")
|
||||||
@ -598,6 +595,8 @@ class CrawlConfigOps:
|
|||||||
update_query["lastCrawlSize"] = sum(
|
update_query["lastCrawlSize"] = sum(
|
||||||
file_.get("size", 0) for file_ in last_crawl.get("files", [])
|
file_.get("size", 0) for file_ in last_crawl.get("files", [])
|
||||||
)
|
)
|
||||||
|
update_query["lastCrawlStopping"] = False
|
||||||
|
update_query["isCrawlRunning"] = False
|
||||||
|
|
||||||
if last_crawl_finished:
|
if last_crawl_finished:
|
||||||
update_query["lastRun"] = last_crawl_finished
|
update_query["lastRun"] = last_crawl_finished
|
||||||
@ -616,16 +615,16 @@ class CrawlConfigOps:
|
|||||||
|
|
||||||
return result is not None
|
return result is not None
|
||||||
|
|
||||||
def _add_curr_crawl_stats(
|
async def _add_running_curr_crawl_stats(self, crawlconfig: CrawlConfigOut):
|
||||||
self, crawlconfig: CrawlConfigOut, crawl: Optional[CrawlOut]
|
|
||||||
):
|
|
||||||
"""Add stats from current running crawl, if any"""
|
"""Add stats from current running crawl, if any"""
|
||||||
|
crawl = await self.get_running_crawl(crawlconfig.id)
|
||||||
if not crawl:
|
if not crawl:
|
||||||
return
|
return
|
||||||
|
|
||||||
crawlconfig.lastCrawlState = crawl.state
|
crawlconfig.lastCrawlState = crawl.state
|
||||||
crawlconfig.lastCrawlSize = crawl.stats.size if crawl.stats else 0
|
crawlconfig.lastCrawlSize = crawl.stats.size if crawl.stats else 0
|
||||||
crawlconfig.lastCrawlStopping = crawl.stopping
|
crawlconfig.lastCrawlStopping = crawl.stopping
|
||||||
|
crawlconfig.isCrawlRunning = True
|
||||||
|
|
||||||
async def get_crawl_config_out(self, cid: UUID, org: Organization):
|
async def get_crawl_config_out(self, cid: UUID, org: Organization):
|
||||||
"""Return CrawlConfigOut, including state of currently running crawl, if active
|
"""Return CrawlConfigOut, including state of currently running crawl, if active
|
||||||
@ -636,9 +635,7 @@ class CrawlConfigOps:
|
|||||||
)
|
)
|
||||||
|
|
||||||
if not crawlconfig.inactive:
|
if not crawlconfig.inactive:
|
||||||
self._add_curr_crawl_stats(
|
await self._add_running_curr_crawl_stats(crawlconfig)
|
||||||
crawlconfig, await self.get_running_crawl(crawlconfig)
|
|
||||||
)
|
|
||||||
|
|
||||||
if crawlconfig.profileid:
|
if crawlconfig.profileid:
|
||||||
crawlconfig.profileName = await self.profiles.get_profile_name(
|
crawlconfig.profileName = await self.profiles.get_profile_name(
|
||||||
@ -715,7 +712,7 @@ class CrawlConfigOps:
|
|||||||
|
|
||||||
query = {"inactive": True}
|
query = {"inactive": True}
|
||||||
|
|
||||||
is_running = await self.get_running_crawl(crawlconfig) is not None
|
is_running = await self.get_running_crawl(crawlconfig.id) is not None
|
||||||
|
|
||||||
if is_running:
|
if is_running:
|
||||||
raise HTTPException(status_code=400, detail="crawl_running_cant_deactivate")
|
raise HTTPException(status_code=400, detail="crawl_running_cant_deactivate")
|
||||||
@ -829,7 +826,7 @@ class CrawlConfigOps:
|
|||||||
"""run new crawl for specified crawlconfig now"""
|
"""run new crawl for specified crawlconfig now"""
|
||||||
self.org_ops.can_write_data(org)
|
self.org_ops.can_write_data(org)
|
||||||
|
|
||||||
if await self.get_running_crawl(crawlconfig):
|
if await self.get_running_crawl(crawlconfig.id):
|
||||||
raise HTTPException(status_code=400, detail="crawl_already_running")
|
raise HTTPException(status_code=400, detail="crawl_already_running")
|
||||||
|
|
||||||
profile_filename = await self.get_profile_filename(crawlconfig.profileid, org)
|
profile_filename = await self.get_profile_filename(crawlconfig.profileid, org)
|
||||||
@ -924,20 +921,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
|
|||||||
Should only be called when a crawl completes from operator or on migration
|
Should only be called when a crawl completes from operator or on migration
|
||||||
when no crawls are running.
|
when no crawls are running.
|
||||||
"""
|
"""
|
||||||
update_query: dict[str, object] = {
|
update_query: dict[str, object] = {}
|
||||||
"crawlCount": 0,
|
|
||||||
"crawlSuccessfulCount": 0,
|
|
||||||
"totalSize": 0,
|
|
||||||
"lastCrawlId": None,
|
|
||||||
"lastCrawlStartTime": None,
|
|
||||||
"lastStartedBy": None,
|
|
||||||
"lastStartedByName": None,
|
|
||||||
"lastCrawlTime": None,
|
|
||||||
"lastCrawlState": None,
|
|
||||||
"lastCrawlSize": None,
|
|
||||||
"lastCrawlStopping": False,
|
|
||||||
"isCrawlRunning": False,
|
|
||||||
}
|
|
||||||
|
|
||||||
match_query = {"cid": cid, "finished": {"$ne": None}}
|
match_query = {"cid": cid, "finished": {"$ne": None}}
|
||||||
count = await crawls.count_documents(match_query)
|
count = await crawls.count_documents(match_query)
|
||||||
@ -950,7 +934,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
|
|||||||
last_crawl: Optional[dict[str, object]] = None
|
last_crawl: Optional[dict[str, object]] = None
|
||||||
last_crawl_size = 0
|
last_crawl_size = 0
|
||||||
|
|
||||||
async for res in crawls.find(match_query).sort("finished", pymongo.DESCENDING):
|
async for res in crawls.find(match_query).sort("finished", pymongo.ASCENDING):
|
||||||
files = res.get("files", [])
|
files = res.get("files", [])
|
||||||
crawl_size = 0
|
crawl_size = 0
|
||||||
for file in files:
|
for file in files:
|
||||||
@ -964,7 +948,11 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
|
|||||||
last_crawl = res
|
last_crawl = res
|
||||||
last_crawl_size = crawl_size
|
last_crawl_size = crawl_size
|
||||||
|
|
||||||
if last_crawl:
|
# only update last_crawl if no crawls running, otherwise
|
||||||
|
# lastCrawl* stats are already for running crawl
|
||||||
|
running_crawl = await crawl_configs.get_running_crawl(cid)
|
||||||
|
|
||||||
|
if last_crawl and not running_crawl:
|
||||||
update_query["totalSize"] = total_size
|
update_query["totalSize"] = total_size
|
||||||
update_query["crawlSuccessfulCount"] = successful_count
|
update_query["crawlSuccessfulCount"] = successful_count
|
||||||
|
|
||||||
@ -974,6 +962,8 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
|
|||||||
update_query["lastStartedByName"] = last_crawl.get("userName")
|
update_query["lastStartedByName"] = last_crawl.get("userName")
|
||||||
update_query["lastCrawlState"] = last_crawl.get("state")
|
update_query["lastCrawlState"] = last_crawl.get("state")
|
||||||
update_query["lastCrawlSize"] = last_crawl_size
|
update_query["lastCrawlSize"] = last_crawl_size
|
||||||
|
update_query["lastCrawlStopping"] = False
|
||||||
|
update_query["isCrawlRunning"] = False
|
||||||
|
|
||||||
last_crawl_finished = last_crawl.get("finished")
|
last_crawl_finished = last_crawl.get("finished")
|
||||||
update_query["lastCrawlTime"] = last_crawl_finished
|
update_query["lastCrawlTime"] = last_crawl_finished
|
||||||
|
Loading…
Reference in New Issue
Block a user