Add last crawl's stats object to CrawlConfigOut (#2714)
Fixes #2709. This will allow us to display page counts (found, done) for the most recent crawl in the workflow list.
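For context, a minimal sketch of how a client might read the new field from the workflow list once this lands. The endpoint path, the `items` response key, and the auth values are illustrative assumptions; only the `lastCrawlStats` object itself is introduced by this change:

```python
import requests

API_BASE = "https://app.browsertrix.com/api"  # hypothetical deployment URL
ORG_ID = "<org-uuid>"                         # placeholder
AUTH_HEADERS = {"Authorization": "Bearer <token>"}

r = requests.get(f"{API_BASE}/orgs/{ORG_ID}/crawlconfigs", headers=AUTH_HEADERS)
for workflow in r.json()["items"]:
    stats = workflow.get("lastCrawlStats")  # None until the workflow has run
    if stats:
        print(f"{workflow.get('name')}: {stats['done']}/{stats['found']} pages")
```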
parent 89027ef16e
commit 993f82a49b
```diff
@@ -852,6 +852,7 @@ class CrawlConfigOps:
                 update_query["lastCrawlSize"] = sum(
                     file_.get("size", 0) for file_ in last_crawl.get("files", [])
                 )
+                update_query["lastCrawlStats"] = last_crawl.get("stats")
                 update_query["lastCrawlStopping"] = False
                 update_query["isCrawlRunning"] = False

@@ -866,6 +867,7 @@ class CrawlConfigOps:
                 update_query["lastCrawlTime"] = None
                 update_query["lastCrawlState"] = None
                 update_query["lastCrawlSize"] = 0
+                update_query["lastCrawlStats"] = None
                 update_query["lastRun"] = None
                 update_query["isCrawlRunning"] = False

@@ -895,6 +897,7 @@ class CrawlConfigOps:
         crawlconfig.lastCrawlShouldPause = crawl.shouldPause
         crawlconfig.lastCrawlPausedAt = crawl.pausedAt
         crawlconfig.lastCrawlPausedExpiry = None
+        crawlconfig.lastCrawlStats = crawl.stats if crawl.stats else None
         if crawl.pausedAt:
             crawlconfig.lastCrawlPausedExpiry = (
                 crawl.pausedAt + self.paused_expiry_delta
@@ -1420,6 +1423,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
             update_query["lastStartedByName"] = last_crawl.get("userName")
             update_query["lastCrawlState"] = last_crawl.get("state")
             update_query["lastCrawlSize"] = last_crawl_size
+            update_query["lastCrawlStats"] = last_crawl.get("stats")
             update_query["lastCrawlStopping"] = False
             update_query["isCrawlRunning"] = False
```
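The `update_query` dicts built in these hunks are flushed to the stored workflow document; the write itself is outside this diff. A minimal sketch of the pattern, assuming a Motor (async MongoDB) collection — the helper name, client URL, and database/collection names are placeholders, not the actual `CrawlConfigOps` code:

```python
from uuid import UUID
from motor.motor_asyncio import AsyncIOMotorClient

async def flush_last_crawl_fields(cid: UUID, update_query: dict) -> None:
    """Hypothetical helper: persist lastCrawl* fields for one workflow."""
    client = AsyncIOMotorClient("mongodb://localhost:27017")
    crawl_configs = client["browsertrix"]["crawl_configs"]  # assumed names
    # $set touches only the lastCrawl* keys collected in update_query
    await crawl_configs.find_one_and_update({"_id": cid}, {"$set": update_query})
```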
```diff
@@ -273,6 +273,15 @@ TYPE_ALL_CRAWL_STATES = Literal[
 ALL_CRAWL_STATES = [*RUNNING_AND_WAITING_STATES, *NON_RUNNING_STATES]


+# ============================================================================
+class CrawlStats(BaseModel):
+    """Crawl Stats for pages and size"""
+
+    found: int = 0
+    done: int = 0
+    size: int = 0
+
+
 # ============================================================================

 ### CRAWL CONFIGS ###
@@ -510,6 +519,7 @@ class CrawlConfigOut(CrawlConfigCore, CrawlConfigAdditional):
     lastCrawlShouldPause: Optional[bool] = False
     lastCrawlPausedAt: Optional[datetime] = None
     lastCrawlPausedExpiry: Optional[datetime] = None
+    lastCrawlStats: Optional[CrawlStats] = None
     profileName: Optional[str] = None

     createdByName: Optional[str] = None
@@ -772,15 +782,6 @@ class CrawlFileOut(BaseModel):
     expireAt: Optional[str] = None


-# ============================================================================
-class CrawlStats(BaseModel):
-    """Crawl Stats for pages and size"""
-
-    found: int = 0
-    done: int = 0
-    size: int = 0
-
-
 # ============================================================================
 class CoreCrawlable(BaseModel):
     # pylint: disable=too-few-public-methods
```
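The `CrawlStats` model itself is unchanged; it is presumably moved earlier in the file so that `CrawlConfigOut.lastCrawlStats` can reference it directly. A standalone sketch of what the field serializes to:

```python
from pydantic import BaseModel

class CrawlStats(BaseModel):
    """Crawl Stats for pages and size"""

    found: int = 0
    done: int = 0
    size: int = 0

stats = CrawlStats(found=10, done=8, size=2048)
print(stats.model_dump())  # pydantic v2; use stats.dict() on v1
# -> {'found': 10, 'done': 8, 'size': 2048}
```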
```diff
@@ -520,6 +520,11 @@ def test_workflow_total_size_and_last_crawl_stats(
             assert workflow["lastRun"]
             assert workflow["lastCrawlSize"] > 0

+            stats = workflow["lastCrawlStats"]
+            assert stats["found"] > 0
+            assert stats["done"] > 0
+            assert stats["size"] > 0
+
             if last_crawl_id == admin_crawl_id:
                 global _admin_crawl_cid
                 _admin_crawl_cid = workflow["id"]
@@ -545,6 +550,11 @@ def test_workflow_total_size_and_last_crawl_stats(
     assert data["lastRun"]
     assert data["lastCrawlSize"] > 0

+    stats = data["lastCrawlStats"]
+    assert stats["found"] > 0
+    assert stats["done"] > 0
+    assert stats["size"] > 0
+

 def test_incremental_workflow_total_size_and_last_crawl_stats(
     crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
@@ -564,6 +574,7 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     last_crawl_started = data["lastCrawlStartTime"]
     last_crawl_finished = data["lastCrawlTime"]
     last_run = data["lastRun"]
+    last_stats = data["lastCrawlStats"]

     # Run new crawl in this workflow
     r = requests.post(
@@ -602,6 +613,10 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     assert data["lastCrawlStartTime"] > last_crawl_started
     assert data["lastCrawlTime"] > last_crawl_finished
     assert data["lastRun"] > last_run
+    stats = data["lastCrawlStats"]
+    assert stats["found"] > 0
+    assert stats["done"] > 0
+    assert stats["size"] > 0

     # Delete new crawl
     r = requests.post(
@@ -628,6 +643,7 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     assert data["lastCrawlStartTime"] == last_crawl_started
     assert data["lastCrawlTime"] == last_crawl_finished
     assert data["lastRun"] == last_run
+    assert data["lastCrawlStats"] == last_stats


 def test_get_config_seeds(crawler_auth_headers, default_org_id, url_list_config_id):
```