Issue 2396 readd pages fixes (#2398)

readd pages fixes:
- add additional mem to background job
- copy page qa data to separate temp coll when re-adding pages, then merge back in
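Re-adding pages deletes all page documents for an archived item and recreates them from its WACZ files, which previously wiped any per-page QA results, approvals, notes, and modification metadata users had accumulated. This commit snapshots those fields into a temporary collection with $out before the delete, then merges them back onto the recreated pages with $merge. Below is a minimal sketch of that pattern, written against plain pymongo; the connection string, database name, and crawl id are illustrative, with field names mirroring the diff that follows:

from pymongo import MongoClient

db = MongoClient("mongodb://localhost:27017")["example"]
pages = db["pages"]

CRAWL_ID = "crawl-1"
TEMP = f"pages-qa-temp-{CRAWL_ID}"

# 1. Snapshot user-entered review state into a temp collection.
#    $nin excludes pages whose qa field is missing, null, or an empty
#    dict, i.e. pages with no QA runs worth preserving.
pages.aggregate(
    [
        {
            "$match": {
                "crawl_id": CRAWL_ID,
                "$or": [
                    {"qa": {"$nin": [None, {}]}},
                    {"modified": {"$ne": None}},
                    {"userid": {"$ne": None}},
                    {"approved": {"$ne": None}},
                    {"notes": {"$ne": None}},
                ],
            }
        },
        {
            "$project": {
                "_id": 1,
                "qa": 1,
                "modified": 1,
                "userid": 1,
                "approved": 1,
                "notes": 1,
            }
        },
        # $out writes the pipeline output to TEMP; the cursor yields nothing
        {"$out": TEMP},
    ]
)

# 2. Here the real job deletes all pages for the crawl and re-creates
#    them from the WACZ files; the rebuilt documents keep their original
#    _id values, which is what step 3 joins on.

# 3. Merge the snapshot back over the rebuilt pages. The default
#    whenMatched="merge" overlays the saved fields onto each page;
#    whenNotMatched="fail" aborts if a reviewed page was not re-created.
db[TEMP].aggregate(
    [{"$merge": {"into": "pages", "on": "_id", "whenNotMatched": "fail"}}]
)
db.drop_collection(TEMP)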
			
			
parent e112f96614
commit 5bebb6161a
@@ -646,6 +646,13 @@ class CrawlOps(BaseCrawlOps):
             return None, None
         return res.get("state"), res.get("finished")
 
+    async def is_upload(self, crawl_id: str):
+        """return true if archived item with this id is an upload"""
+        res = await self.crawls.find_one({"_id": crawl_id}, projection={"type": 1})
+        if not res:
+            return False
+        return res.get("type") == "upload"
+
     async def add_crawl_error(
         self,
         crawl_id: str,
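The new is_upload helper lets the page re-add path tell uploads apart from crawls, presumably because QA runs and review data only apply to crawls, so uploads can skip the snapshot-and-merge round trip below. The projection on type keeps the lookup from fetching the full crawl document.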
@@ -66,6 +66,7 @@ class PageOps:
     ):
         self.pages = mdb["pages"]
         self.crawls = mdb["crawls"]
+        self.mdb = mdb
         self.crawl_ops = crawl_ops
         self.org_ops = org_ops
         self.storage_ops = storage_ops
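PageOps previously held handles only to the named pages and crawls collections; keeping the raw mdb database handle as well lets re_add_crawl_pages address the dynamically named temp collection (self.mdb[qa_temp_db_name]) created during the QA round trip.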
@@ -785,9 +786,70 @@ class PageOps:
 
     async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
         """Delete existing pages for crawl and re-add from WACZs."""
-        await self.delete_crawl_pages(crawl_id, oid)
-        print(f"Deleted pages for crawl {crawl_id}", flush=True)
-        await self.add_crawl_pages_to_db_from_wacz(crawl_id)
+
+        try:
+            is_upload = await self.crawl_ops.is_upload(crawl_id)
+            print(f"Processing {'upload' if is_upload else 'crawl'} {crawl_id}")
+            if not is_upload:
+                ts_now = dt_now().strftime("%Y%m%d%H%M%S")
+                qa_temp_db_name = f"pages-qa-temp-{crawl_id}-{ts_now}"
+                cursor = self.pages.aggregate(
+                    [
+                        {
+                            "$match": {
+                                "crawl_id": crawl_id,
+                                "$or": [
+                                    {"qa": {"$nin": [None, {}]}},
+                                    {"modified": {"$ne": None}},
+                                    {"userid": {"$ne": None}},
+                                    {"approved": {"$ne": None}},
+                                    {"notes": {"$ne": None}},
+                                ],
+                            }
+                        },
+                        {
+                            "$project": {
+                                "_id": 1,
+                                "qa": 1,
+                                "modified": 1,
+                                "userid": 1,
+                                "approved": 1,
+                                "notes": 1,
+                            }
+                        },
+                        {"$out": qa_temp_db_name},
+                    ]
+                )
+                print(f"Stored QA data in temp db {qa_temp_db_name}")
+                assert await cursor.to_list() == []
+
+            await self.delete_crawl_pages(crawl_id, oid)
+            print(f"Deleted pages for crawl {crawl_id}", flush=True)
+            await self.add_crawl_pages_to_db_from_wacz(crawl_id)
+
+            if not is_upload:
+                qa_temp_db = self.mdb[qa_temp_db_name]
+                cursor = qa_temp_db.aggregate(
+                    [
+                        {
+                            "$merge": {
+                                "into": "pages",
+                                "on": ["_id"],
+                                "whenNotMatched": "fail",
+                            }
+                        }
+                    ]
+                )
+                print(f"Merged QA data from temp db {qa_temp_db_name}")
+                # async for data in qa_temp_db.find({}):
+                #    print("qa data", data)
+
+                assert await cursor.to_list() == []
+                await qa_temp_db.drop()
+                print(f"Dropped temp db {qa_temp_db_name}")
+        # pylint: disable=broad-exception-caught
+        except Exception as e:
+            print(e)
 
     async def re_add_all_crawl_pages(
         self, org: Organization, crawl_type: Optional[str] = None
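A few details worth noting in this hunk: the temp collection name carries a timestamp suffix, so two re-add jobs for the same crawl cannot collide on the same collection; the assert await cursor.to_list() == [] lines simply drain the cursors, since $out and $merge stages write to a collection and return no documents; and the broad except keeps a failure in the QA round trip from crashing the whole background job, at the cost of only printing the error. With whenNotMatched: "fail", the merge-back aborts if any snapshotted page _id no longer exists, e.g. because a reviewed page was not re-created from the WACZs. Continuing the pymongo sketch above, that failure would surface roughly as follows (illustrative; note $merge is not atomic, so some documents may already have been merged when it fails):

from pymongo.errors import OperationFailure

try:
    db[TEMP].aggregate(
        [{"$merge": {"into": "pages", "on": "_id", "whenNotMatched": "fail"}}]
    )
except OperationFailure as err:
    # the server aborts the $merge and pymongo raises OperationFailure
    print("QA merge-back failed:", err)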
@@ -55,8 +55,8 @@ spec:
 
           resources:
             limits:
-              memory: "200Mi"
+              memory: "500Mi"
 
             requests:
-              memory: "200Mi"
-              cpu: "50m"
+              memory: "250Mi"
+              cpu: "200m"
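Finally, per the commit message, the chart gives the background job more headroom for the extra aggregation passes: the memory limit rises from 200Mi to 500Mi, the memory request from 200Mi to 250Mi, and the CPU request from 50m to 200m.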