Add last crawl's stats object to CrawlConfigOut (#2714)

Fixes #2709 

This will allow us to display page counts (found, done) from the last crawl in
the workflow list.
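For illustration, a minimal sketch of how a client could read the new field, assuming the paginated workflow list endpoint at /orgs/{oid}/crawlconfigs and using placeholder base URL, org id, and token (the field names match the CrawlStats model added here):

import requests

# Placeholders for illustration only
API_PREFIX = "https://app.example.com/api"
ORG_ID = "<org-id>"
HEADERS = {"Authorization": "Bearer <token>"}

r = requests.get(f"{API_PREFIX}/orgs/{ORG_ID}/crawlconfigs", headers=HEADERS)
for workflow in r.json().get("items", []):
    stats = workflow.get("lastCrawlStats")  # None if the workflow has no finished crawl
    if stats:
        print(workflow["id"], "found:", stats["found"], "done:", stats["done"], "size:", stats["size"])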
Tessa Walsh 2025-07-23 23:10:46 -04:00 committed by GitHub
parent 89027ef16e
commit 993f82a49b
3 changed files with 30 additions and 9 deletions

View File

@@ -852,6 +852,7 @@ class CrawlConfigOps:
update_query["lastCrawlSize"] = sum(
file_.get("size", 0) for file_ in last_crawl.get("files", [])
)
update_query["lastCrawlStats"] = last_crawl.get("stats")
update_query["lastCrawlStopping"] = False
update_query["isCrawlRunning"] = False
@@ -866,6 +867,7 @@ class CrawlConfigOps:
update_query["lastCrawlTime"] = None
update_query["lastCrawlState"] = None
update_query["lastCrawlSize"] = 0
update_query["lastCrawlStats"] = None
update_query["lastRun"] = None
update_query["isCrawlRunning"] = False
@@ -895,6 +897,7 @@ class CrawlConfigOps:
crawlconfig.lastCrawlShouldPause = crawl.shouldPause
crawlconfig.lastCrawlPausedAt = crawl.pausedAt
crawlconfig.lastCrawlPausedExpiry = None
crawlconfig.lastCrawlStats = crawl.stats if crawl.stats else None
if crawl.pausedAt:
crawlconfig.lastCrawlPausedExpiry = (
crawl.pausedAt + self.paused_expiry_delta
@@ -1420,6 +1423,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
update_query["lastStartedByName"] = last_crawl.get("userName")
update_query["lastCrawlState"] = last_crawl.get("state")
update_query["lastCrawlSize"] = last_crawl_size
update_query["lastCrawlStats"] = last_crawl.get("stats")
update_query["lastCrawlStopping"] = False
update_query["isCrawlRunning"] = False

View File

@@ -273,6 +273,15 @@ TYPE_ALL_CRAWL_STATES = Literal[
ALL_CRAWL_STATES = [*RUNNING_AND_WAITING_STATES, *NON_RUNNING_STATES]
# ============================================================================
class CrawlStats(BaseModel):
"""Crawl Stats for pages and size"""
found: int = 0
done: int = 0
size: int = 0
# ============================================================================
### CRAWL CONFIGS ###
@@ -510,6 +519,7 @@ class CrawlConfigOut(CrawlConfigCore, CrawlConfigAdditional):
lastCrawlShouldPause: Optional[bool] = False
lastCrawlPausedAt: Optional[datetime] = None
lastCrawlPausedExpiry: Optional[datetime] = None
lastCrawlStats: Optional[CrawlStats] = None
profileName: Optional[str] = None
createdByName: Optional[str] = None
@@ -772,15 +782,6 @@ class CrawlFileOut(BaseModel):
expireAt: Optional[str] = None
# ============================================================================
class CrawlStats(BaseModel):
"""Crawl Stats for pages and size"""
found: int = 0
done: int = 0
size: int = 0
# ============================================================================
class CoreCrawlable(BaseModel):
# pylint: disable=too-few-public-methods
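Moving CrawlStats above the crawl config section lets CrawlConfigOut reference it directly. A minimal, self-contained sketch of how the new optional field serializes (WorkflowOutSketch is a hypothetical stand-in for CrawlConfigOut, not part of this change):

from typing import Optional
from pydantic import BaseModel

class CrawlStats(BaseModel):
    """Crawl Stats for pages and size"""
    found: int = 0
    done: int = 0
    size: int = 0

class WorkflowOutSketch(BaseModel):
    # Hypothetical stand-in showing only the field added in this change
    lastCrawlStats: Optional[CrawlStats] = None

print(WorkflowOutSketch(lastCrawlStats=CrawlStats(found=12, done=10, size=4096)))
print(WorkflowOutSketch())  # lastCrawlStats=None until the workflow has a finished crawl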

View File

@@ -520,6 +520,11 @@ def test_workflow_total_size_and_last_crawl_stats(
assert workflow["lastRun"]
assert workflow["lastCrawlSize"] > 0
stats = workflow["lastCrawlStats"]
assert stats["found"] > 0
assert stats["done"] > 0
assert stats["size"] > 0
if last_crawl_id == admin_crawl_id:
global _admin_crawl_cid
_admin_crawl_cid = workflow["id"]
@@ -545,6 +550,11 @@ def test_workflow_total_size_and_last_crawl_stats(
assert data["lastRun"]
assert data["lastCrawlSize"] > 0
stats = data["lastCrawlStats"]
assert stats["found"] > 0
assert stats["done"] > 0
assert stats["size"] > 0
def test_incremental_workflow_total_size_and_last_crawl_stats(
crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
@@ -564,6 +574,7 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
last_crawl_started = data["lastCrawlStartTime"]
last_crawl_finished = data["lastCrawlTime"]
last_run = data["lastRun"]
last_stats = data["lastCrawlStats"]
# Run new crawl in this workflow
r = requests.post(
@@ -602,6 +613,10 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
assert data["lastCrawlStartTime"] > last_crawl_started
assert data["lastCrawlTime"] > last_crawl_finished
assert data["lastRun"] > last_run
stats = data["lastCrawlStats"]
assert stats["found"] > 0
assert stats["done"] > 0
assert stats["size"] > 0
# Delete new crawl
r = requests.post(
@@ -628,6 +643,7 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
assert data["lastCrawlStartTime"] == last_crawl_started
assert data["lastCrawlTime"] == last_crawl_finished
assert data["lastRun"] == last_run
assert data["lastCrawlStats"] == last_stats
def test_get_config_seeds(crawler_auth_headers, default_org_id, url_list_config_id):