Add last crawl's stats object to CrawlConfigOut (#2714)
Fixes #2709. This will allow us to display information about page counts (found, done) in the workflow list.
parent 89027ef16e
commit 993f82a49b
@@ -852,6 +852,7 @@ class CrawlConfigOps:
                 update_query["lastCrawlSize"] = sum(
                     file_.get("size", 0) for file_ in last_crawl.get("files", [])
                 )
+                update_query["lastCrawlStats"] = last_crawl.get("stats")
                 update_query["lastCrawlStopping"] = False
                 update_query["isCrawlRunning"] = False

@@ -866,6 +867,7 @@ class CrawlConfigOps:
                 update_query["lastCrawlTime"] = None
                 update_query["lastCrawlState"] = None
                 update_query["lastCrawlSize"] = 0
+                update_query["lastCrawlStats"] = None
                 update_query["lastRun"] = None
                 update_query["isCrawlRunning"] = False

@@ -895,6 +897,7 @@ class CrawlConfigOps:
         crawlconfig.lastCrawlShouldPause = crawl.shouldPause
         crawlconfig.lastCrawlPausedAt = crawl.pausedAt
         crawlconfig.lastCrawlPausedExpiry = None
+        crawlconfig.lastCrawlStats = crawl.stats if crawl.stats else None
         if crawl.pausedAt:
             crawlconfig.lastCrawlPausedExpiry = (
                 crawl.pausedAt + self.paused_expiry_delta
@@ -1420,6 +1423,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
             update_query["lastStartedByName"] = last_crawl.get("userName")
             update_query["lastCrawlState"] = last_crawl.get("state")
             update_query["lastCrawlSize"] = last_crawl_size
+            update_query["lastCrawlStats"] = last_crawl.get("stats")
             update_query["lastCrawlStopping"] = False
             update_query["isCrawlRunning"] = False

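For orientation, the `update_query` dicts built above are applied to the stored workflow document, so `lastCrawlStats` is written (or cleared) alongside the other `lastCrawl*` fields. A rough sketch of that pattern, with the collection handle and call site assumed rather than shown in this diff:

```python
from uuid import UUID

async def apply_last_crawl_update(crawl_configs, cid: UUID) -> None:
    # Illustrative values; the real query is built field by field as above
    update_query = {
        "lastCrawlState": "complete",
        "lastCrawlSize": 1024,
        "lastCrawlStats": {"found": 10, "done": 10, "size": 1024},
        "lastCrawlStopping": False,
        "isCrawlRunning": False,
    }
    # $set only touches the named fields, leaving the rest of the document intact
    await crawl_configs.find_one_and_update({"_id": cid}, {"$set": update_query})
```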
@@ -273,6 +273,15 @@ TYPE_ALL_CRAWL_STATES = Literal[
 ALL_CRAWL_STATES = [*RUNNING_AND_WAITING_STATES, *NON_RUNNING_STATES]
 
 
+# ============================================================================
+class CrawlStats(BaseModel):
+    """Crawl Stats for pages and size"""
+
+    found: int = 0
+    done: int = 0
+    size: int = 0
+
+
 # ============================================================================
 
 ### CRAWL CONFIGS ###
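
Since `CrawlStats` is a plain Pydantic model with zero defaults, the raw `stats` dict stored on a crawl parses straight into it, and missing keys are tolerated. A small self-contained sketch (the example values are illustrative):

```python
from pydantic import BaseModel

class CrawlStats(BaseModel):
    """Crawl Stats for pages and size"""

    found: int = 0
    done: int = 0
    size: int = 0

# Missing keys fall back to the declared defaults of 0
stats = CrawlStats(**{"found": 12, "done": 9})
assert stats.done == 9 and stats.size == 0
```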
@@ -510,6 +519,7 @@ class CrawlConfigOut(CrawlConfigCore, CrawlConfigAdditional):
     lastCrawlShouldPause: Optional[bool] = False
     lastCrawlPausedAt: Optional[datetime] = None
     lastCrawlPausedExpiry: Optional[datetime] = None
+    lastCrawlStats: Optional[CrawlStats] = None
     profileName: Optional[str] = None
 
     createdByName: Optional[str] = None

@@ -772,15 +782,6 @@ class CrawlFileOut(BaseModel):
     expireAt: Optional[str] = None
 
 
-# ============================================================================
-class CrawlStats(BaseModel):
-    """Crawl Stats for pages and size"""
-
-    found: int = 0
-    done: int = 0
-    size: int = 0
-
-
 # ============================================================================
 class CoreCrawlable(BaseModel):
     # pylint: disable=too-few-public-methods
@@ -520,6 +520,11 @@ def test_workflow_total_size_and_last_crawl_stats(
             assert workflow["lastRun"]
             assert workflow["lastCrawlSize"] > 0
 
+            stats = workflow["lastCrawlStats"]
+            assert stats["found"] > 0
+            assert stats["done"] > 0
+            assert stats["size"] > 0
+
             if last_crawl_id == admin_crawl_id:
                 global _admin_crawl_cid
                 _admin_crawl_cid = workflow["id"]

@@ -545,6 +550,11 @@ def test_workflow_total_size_and_last_crawl_stats(
     assert data["lastRun"]
     assert data["lastCrawlSize"] > 0
 
+    stats = data["lastCrawlStats"]
+    assert stats["found"] > 0
+    assert stats["done"] > 0
+    assert stats["size"] > 0
+
 
 def test_incremental_workflow_total_size_and_last_crawl_stats(
     crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
@@ -564,6 +574,7 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     last_crawl_started = data["lastCrawlStartTime"]
     last_crawl_finished = data["lastCrawlTime"]
     last_run = data["lastRun"]
+    last_stats = data["lastCrawlStats"]
 
     # Run new crawl in this workflow
     r = requests.post(
@@ -602,6 +613,10 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     assert data["lastCrawlStartTime"] > last_crawl_started
     assert data["lastCrawlTime"] > last_crawl_finished
     assert data["lastRun"] > last_run
+    stats = data["lastCrawlStats"]
+    assert stats["found"] > 0
+    assert stats["done"] > 0
+    assert stats["size"] > 0
 
     # Delete new crawl
     r = requests.post(
@@ -628,6 +643,7 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     assert data["lastCrawlStartTime"] == last_crawl_started
     assert data["lastCrawlTime"] == last_crawl_finished
     assert data["lastRun"] == last_run
+    assert data["lastCrawlStats"] == last_stats
 
 
 def test_get_config_seeds(crawler_auth_headers, default_org_id, url_list_config_id):