operator fixes (#834)

- pass cid from the operator directly, for consistency, instead of recovering it from the crawl returned by update_crawl() (a different object)
- don't throw in update_config_crawl_stats(), so no exception is raised inside the operator; throw only in the crawlconfigs API

commit b40d599e17 (parent f992704491)

@@ -622,7 +622,11 @@ class CrawlConfigOps:

     async def update_crawl_stats(self, cid: uuid.UUID):
         """Update crawl count, total size, and last crawl information for config."""
-        await update_config_crawl_stats(self.crawl_configs, self.crawls, cid)
+        result = await update_config_crawl_stats(self.crawl_configs, self.crawls, cid)
+        if not result:
+            raise HTTPException(
+                status_code=404, detail=f"Crawl Config '{cid}' not found to update"
+            )

     def _add_curr_crawl_stats(self, crawlconfig, crawl):
         """Add stats from current running crawl, if any"""
@@ -921,11 +925,6 @@ async def update_config_crawl_stats(crawl_configs, crawls, cid: uuid.UUID):
         return_document=pymongo.ReturnDocument.AFTER,
     )

-    if not result:
-        raise HTTPException(
-            status_code=404, detail=f"Crawl Config '{cid}' not found to update"
-        )
-
     return result

@@ -139,6 +139,7 @@ class BtrixOperator(K8sAPI):

         spec = data.parent.get("spec", {})
         crawl_id = spec["id"]
+        cid = spec["cid"]

         scale = spec.get("scale", 1)
         status.scale = scale
@@ -149,21 +150,19 @@ class BtrixOperator(K8sAPI):
         if data.finalizing:
             # if not yet finished, assume it was canceled, mark as such
             if not status.finished:
-                await self.cancel_crawl(redis_url, crawl_id, status, "canceled")
+                await self.cancel_crawl(redis_url, crawl_id, cid, status, "canceled")

             return await self.finalize_crawl(crawl_id, status, data.related)

         if status.finished:
             return await self.handle_finished_delete_if_needed(crawl_id, status, spec)

-        cid = spec["cid"]
-
         try:
             configmap = data.related[CMAP][f"crawl-config-{cid}"]["data"]
         # pylint: disable=bare-except, broad-except
         except:
             # fail crawl if config somehow missing, shouldn't generally happen
-            await self.cancel_crawl(redis_url, crawl_id, status, "failed")
+            await self.cancel_crawl(redis_url, crawl_id, cid, status, "failed")

             return self._done_response(status)

@@ -278,10 +277,10 @@ class BtrixOperator(K8sAPI):
             print("PVC Delete failed", exc, flush=True)

     # pylint: disable=too-many-arguments
-    async def cancel_crawl(self, redis_url, crawl_id, status, state):
+    async def cancel_crawl(self, redis_url, crawl_id, cid, status, state):
         """immediately cancel crawl with specified state"""
         redis = await self._get_redis(redis_url)
-        await self.mark_finished(redis, crawl_id, status, state)
+        await self.mark_finished(redis, crawl_id, cid, status, state)

     def _done_response(self, status, finalized=False):
         """done response for removing crawl"""
@@ -421,25 +420,29 @@ class BtrixOperator(K8sAPI):
             # check if one-page crawls actually succeeded
             # if only one page found, and no files, assume failed
             if status.pagesFound == 1 and not status.filesAdded:
-                return await self.mark_finished(redis, crawl.id, status, state="failed")
+                return await self.mark_finished(
+                    redis, crawl.id, crawl.cid, status, state="failed"
+                )

             completed = status.pagesDone and status.pagesDone >= status.pagesFound

             state = "complete" if completed else "partial_complete"

             status = await self.mark_finished(
-                redis, crawl.id, status, state, crawl, stats
+                redis, crawl.id, crawl.cid, status, state, crawl, stats
             )

         # check if all crawlers failed
         if failed >= crawl.scale:
-            status = await self.mark_finished(redis, crawl.id, status, state="failed")
+            status = await self.mark_finished(
+                redis, crawl.id, crawl.cid, status, state="failed"
+            )

         return status

     # pylint: disable=too-many-arguments
     async def mark_finished(
-        self, redis, crawl_id, status, state, crawl=None, stats=None
+        self, redis, crawl_id, cid, status, state, crawl=None, stats=None
     ):
         """mark crawl as finished, set finished timestamp and final state"""
         finished = dt_now()
@@ -448,10 +451,9 @@ class BtrixOperator(K8sAPI):
         if stats:
             kwargs["stats"] = stats

-        crawl = await update_crawl(self.crawls, crawl_id, **kwargs)
-        crawl_cid = crawl.get("cid")
+        await update_crawl(self.crawls, crawl_id, **kwargs)

-        await update_config_crawl_stats(self.crawl_configs, self.crawls, crawl_cid)
+        await update_config_crawl_stats(self.crawl_configs, self.crawls, cid)

         if redis:
             await self.add_crawl_errors_to_db(redis, crawl_id)

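On the operator side, the net effect is that cid is read once from the spec and threaded through the whole cancel/finish path, so mark_finished no longer has to re-read the updated crawl document just to recover its cid. A condensed sketch of the resulting flow (pieced together from the hunks above, not complete methods):

    # cid now comes from the spec, next to the crawl id
    crawl_id = spec["id"]
    cid = spec["cid"]

    # ...and is passed down instead of being looked up from the updated crawl doc:
    await self.cancel_crawl(redis_url, crawl_id, cid, status, "canceled")
    #  -> await self.mark_finished(redis, crawl_id, cid, status, state)
    #     -> await update_config_crawl_stats(self.crawl_configs, self.crawls, cid)
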
@@ -67,10 +67,12 @@ def test_get_configs_by_description(
         assert config["description"] == description


-def test_get_configs_by_schedule_true(crawler_auth_headers, default_org_id, crawler_crawl_id):
+def test_get_configs_by_schedule_true(
+    crawler_auth_headers, default_org_id, crawler_crawl_id
+):
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?schedule=True",
-        headers=crawler_auth_headers
+        headers=crawler_auth_headers,
     )
     data = r.json()
     assert data["total"] == 1
@@ -78,10 +80,12 @@ def test_get_configs_by_schedule_true(crawler_auth_headers, default_org_id, craw
     assert workflow.get("schedule") not in ("", None)


-def test_get_configs_by_schedule_false(crawler_auth_headers, default_org_id, crawler_crawl_id):
+def test_get_configs_by_schedule_false(
+    crawler_auth_headers, default_org_id, crawler_crawl_id
+):
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?schedule=False",
-        headers=crawler_auth_headers
+        headers=crawler_auth_headers,
     )
     data = r.json()
     assert data["total"] >= 1