Add last crawl's stats object to CrawlConfigOut (#2714)
Fixes #2709. This will allow us to display information about page counts (found, done) in the workflow list.
parent 89027ef16e · commit 993f82a49b
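
With this change, each workflow in the list response carries its last crawl's page counts alongside the existing size and state fields. A minimal sketch of reading them from the workflow list; the API origin, org id, and token below are illustrative placeholders, not values from this commit:

    import requests

    API = "https://app.browsertrix.com/api"   # placeholder origin
    ORG_ID = "<org-uuid>"                      # placeholder org id
    HEADERS = {"Authorization": "Bearer <token>"}

    r = requests.get(f"{API}/orgs/{ORG_ID}/crawlconfigs", headers=HEADERS)
    for workflow in r.json()["items"]:
        # lastCrawlStats is None if the workflow has never run
        stats = workflow.get("lastCrawlStats")
        if stats:
            print(workflow["id"],
                  f"{stats['done']}/{stats['found']} pages, {stats['size']} bytes")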
backend/btrixcloud/crawlconfigs.py

@@ -852,6 +852,7 @@ class CrawlConfigOps:
         update_query["lastCrawlSize"] = sum(
             file_.get("size", 0) for file_ in last_crawl.get("files", [])
         )
+        update_query["lastCrawlStats"] = last_crawl.get("stats")
         update_query["lastCrawlStopping"] = False
         update_query["isCrawlRunning"] = False
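
The hunks in this file only assemble fields into update_query. For context, a minimal standalone sketch of how such a query is typically applied to the workflow document in a single $set; the connection string and db/collection names are assumptions for illustration, not taken from this diff:

    from uuid import UUID
    from motor.motor_asyncio import AsyncIOMotorClient

    async def apply_update(cid: UUID, update_query: dict) -> None:
        # Assumed connection string and db/collection names
        client = AsyncIOMotorClient("mongodb://localhost:27017")
        crawl_configs = client["btrix"]["crawl_configs"]
        # All recomputed fields land atomically in one $set
        await crawl_configs.find_one_and_update({"_id": cid}, {"$set": update_query})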
@@ -866,6 +867,7 @@ class CrawlConfigOps:
         update_query["lastCrawlTime"] = None
         update_query["lastCrawlState"] = None
         update_query["lastCrawlSize"] = 0
+        update_query["lastCrawlStats"] = None
         update_query["lastRun"] = None
         update_query["isCrawlRunning"] = False
@@ -895,6 +897,7 @@ class CrawlConfigOps:
         crawlconfig.lastCrawlShouldPause = crawl.shouldPause
         crawlconfig.lastCrawlPausedAt = crawl.pausedAt
         crawlconfig.lastCrawlPausedExpiry = None
+        crawlconfig.lastCrawlStats = crawl.stats if crawl.stats else None
         if crawl.pausedAt:
             crawlconfig.lastCrawlPausedExpiry = (
                 crawl.pausedAt + self.paused_expiry_delta
@@ -1420,6 +1423,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
         update_query["lastStartedByName"] = last_crawl.get("userName")
         update_query["lastCrawlState"] = last_crawl.get("state")
         update_query["lastCrawlSize"] = last_crawl_size
+        update_query["lastCrawlStats"] = last_crawl.get("stats")
         update_query["lastCrawlStopping"] = False
         update_query["isCrawlRunning"] = False
backend/btrixcloud/models.py

@@ -273,6 +273,15 @@ TYPE_ALL_CRAWL_STATES = Literal[
 ALL_CRAWL_STATES = [*RUNNING_AND_WAITING_STATES, *NON_RUNNING_STATES]
 
 
+# ============================================================================
+class CrawlStats(BaseModel):
+    """Crawl Stats for pages and size"""
+
+    found: int = 0
+    done: int = 0
+    size: int = 0
+
+
 # ============================================================================
 
 ### CRAWL CONFIGS ###
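
For reference, a standalone sketch of how this pydantic model behaves: missing keys fall back to the zero defaults, so a stats dict with only some counters still validates (the example values are made up):

    from pydantic import BaseModel

    class CrawlStats(BaseModel):
        """Crawl Stats for pages and size"""

        found: int = 0
        done: int = 0
        size: int = 0

    # A partial stats dict from an older crawl document still validates;
    # absent counters default to zero
    stats = CrawlStats(**{"found": 120, "done": 95})
    assert stats.found == 120 and stats.done == 95 and stats.size == 0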
@@ -510,6 +519,7 @@ class CrawlConfigOut(CrawlConfigCore, CrawlConfigAdditional):
     lastCrawlShouldPause: Optional[bool] = False
     lastCrawlPausedAt: Optional[datetime] = None
     lastCrawlPausedExpiry: Optional[datetime] = None
+    lastCrawlStats: Optional[CrawlStats] = None
     profileName: Optional[str] = None
 
     createdByName: Optional[str] = None
@@ -772,15 +782,6 @@ class CrawlFileOut(BaseModel):
     expireAt: Optional[str] = None
 
 
-# ============================================================================
-class CrawlStats(BaseModel):
-    """Crawl Stats for pages and size"""
-
-    found: int = 0
-    done: int = 0
-    size: int = 0
-
-
 # ============================================================================
 class CoreCrawlable(BaseModel):
     # pylint: disable=too-few-public-methods
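
Note that the CrawlStats class itself is unchanged by this pair of hunks: it is only moved from after CrawlFileOut to before the crawl config models, so that it is already defined when CrawlConfigOut declares the new lastCrawlStats field.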
backend/test/test_crawlconfigs.py

@@ -520,6 +520,11 @@ def test_workflow_total_size_and_last_crawl_stats(
         assert workflow["lastRun"]
         assert workflow["lastCrawlSize"] > 0
 
+        stats = workflow["lastCrawlStats"]
+        assert stats["found"] > 0
+        assert stats["done"] > 0
+        assert stats["size"] > 0
+
         if last_crawl_id == admin_crawl_id:
             global _admin_crawl_cid
             _admin_crawl_cid = workflow["id"]
@@ -545,6 +550,11 @@ def test_workflow_total_size_and_last_crawl_stats(
     assert data["lastRun"]
     assert data["lastCrawlSize"] > 0
 
+    stats = data["lastCrawlStats"]
+    assert stats["found"] > 0
+    assert stats["done"] > 0
+    assert stats["size"] > 0
+
 
 def test_incremental_workflow_total_size_and_last_crawl_stats(
     crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
@@ -564,6 +574,7 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     last_crawl_started = data["lastCrawlStartTime"]
     last_crawl_finished = data["lastCrawlTime"]
     last_run = data["lastRun"]
+    last_stats = data["lastCrawlStats"]
 
     # Run new crawl in this workflow
     r = requests.post(
@@ -602,6 +613,10 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     assert data["lastCrawlStartTime"] > last_crawl_started
     assert data["lastCrawlTime"] > last_crawl_finished
     assert data["lastRun"] > last_run
+    stats = data["lastCrawlStats"]
+    assert stats["found"] > 0
+    assert stats["done"] > 0
+    assert stats["size"] > 0
 
     # Delete new crawl
     r = requests.post(
@@ -628,6 +643,7 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     assert data["lastCrawlStartTime"] == last_crawl_started
     assert data["lastCrawlTime"] == last_crawl_finished
     assert data["lastRun"] == last_run
+    assert data["lastCrawlStats"] == last_stats
 
 
 def test_get_config_seeds(crawler_auth_headers, default_org_id, url_list_config_id):
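
The new assertions extend the existing workflow size/stats tests rather than adding a new test module. With a test deployment up and the suite's fixtures (crawler_auth_headers, admin_crawl_id, and so on) available, an invocation along the lines of pytest backend/test/test_crawlconfigs.py -k last_crawl_stats should exercise all three covered paths, though the exact command depends on the suite's conftest.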