Issue 2396 readd pages fixes (#2398)
Re-add pages fixes:
- add additional memory to the background job
- copy page QA data to a separate temp collection when re-adding pages, then merge it back in (sketched below)
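The key change is the second item: before a crawl's pages are deleted, any QA-bearing page fields are copied into a uniquely named temporary collection with an aggregation $out stage, and after the pages have been re-added from the WACZ files the saved fields are merged back onto them with $merge. Below is a minimal sketch of that copy-then-merge pattern in isolation, assuming a Motor (async MongoDB) client and MongoDB 4.2+ (which introduced $merge); the connection string, database name, and trimmed field list are placeholders, and the actual implementation is the re_add_crawl_pages hunk further down.

import asyncio
from datetime import datetime, timezone

from motor.motor_asyncio import AsyncIOMotorClient

DB_URL = "mongodb://localhost:27017"  # placeholder connection string


async def preserve_and_restore_qa(crawl_id: str) -> None:
    db = AsyncIOMotorClient(DB_URL)["btrix_example"]  # placeholder db name
    pages = db["pages"]

    # 1. Copy QA-bearing fields for this crawl into a uniquely named temp
    #    collection; $out creates (or replaces) that collection.
    ts_now = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
    temp_name = f"pages-qa-temp-{crawl_id}-{ts_now}"
    await pages.aggregate(
        [
            {"$match": {"crawl_id": crawl_id, "qa": {"$nin": [None, {}]}}},
            {"$project": {"_id": 1, "qa": 1, "modified": 1, "notes": 1}},
            {"$out": temp_name},
        ]
    ).to_list(length=None)  # an $out pipeline returns no documents

    # 2. ...delete the crawl's pages and re-add them from WACZs here...

    # 3. Merge the saved fields back onto the re-added pages, keyed on _id.
    #    whenMatched defaults to "merge", so only the saved fields are
    #    overlaid; whenNotMatched="fail" surfaces pages that no longer exist.
    await db[temp_name].aggregate(
        [{"$merge": {"into": "pages", "on": "_id", "whenNotMatched": "fail"}}]
    ).to_list(length=None)

    # 4. Drop the temp collection once the merge has completed.
    await db[temp_name].drop()


if __name__ == "__main__":
    asyncio.run(preserve_and_restore_qa("example-crawl-id"))

Timestamping the temp collection name avoids collisions with an earlier run that failed part-way, and because $merge matches on _id the pattern relies on page IDs staying stable across the re-add. The real method also skips the copy/merge entirely for uploads (via the new is_upload helper).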
			
			
parent e112f96614
commit 5bebb6161a
@@ -646,6 +646,13 @@ class CrawlOps(BaseCrawlOps):
             return None, None
         return res.get("state"), res.get("finished")
 
+    async def is_upload(self, crawl_id: str):
+        """return true if archived item with this id is an upload"""
+        res = await self.crawls.find_one({"_id": crawl_id}, projection={"type": 1})
+        if not res:
+            return False
+        return res.get("type") == "upload"
+
     async def add_crawl_error(
         self,
         crawl_id: str,
@@ -66,6 +66,7 @@ class PageOps:
     ):
         self.pages = mdb["pages"]
         self.crawls = mdb["crawls"]
+        self.mdb = mdb
         self.crawl_ops = crawl_ops
         self.org_ops = org_ops
         self.storage_ops = storage_ops
@@ -785,9 +786,70 @@ class PageOps:
 
     async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
         """Delete existing pages for crawl and re-add from WACZs."""
-        await self.delete_crawl_pages(crawl_id, oid)
-        print(f"Deleted pages for crawl {crawl_id}", flush=True)
-        await self.add_crawl_pages_to_db_from_wacz(crawl_id)
+
+        try:
+            is_upload = await self.crawl_ops.is_upload(crawl_id)
+            print(f"Processing {'upload' if is_upload else 'crawl'} {crawl_id}")
+            if not is_upload:
+                ts_now = dt_now().strftime("%Y%m%d%H%M%S")
+                qa_temp_db_name = f"pages-qa-temp-{crawl_id}-{ts_now}"
+                cursor = self.pages.aggregate(
+                    [
+                        {
+                            "$match": {
+                                "crawl_id": crawl_id,
+                                "$or": [
+                                    {"qa": {"$nin": [None, {}]}},
+                                    {"modified": {"$ne": None}},
+                                    {"userid": {"$ne": None}},
+                                    {"approved": {"$ne": None}},
+                                    {"notes": {"$ne": None}},
+                                ],
+                            }
+                        },
+                        {
+                            "$project": {
+                                "_id": 1,
+                                "qa": 1,
+                                "modified": 1,
+                                "userid": 1,
+                                "approved": 1,
+                                "notes": 1,
+                            }
+                        },
+                        {"$out": qa_temp_db_name},
+                    ]
+                )
+                print(f"Stored QA data in temp db {qa_temp_db_name}")
+                assert await cursor.to_list() == []
+
+            await self.delete_crawl_pages(crawl_id, oid)
+            print(f"Deleted pages for crawl {crawl_id}", flush=True)
+            await self.add_crawl_pages_to_db_from_wacz(crawl_id)
+
+            if not is_upload:
+                qa_temp_db = self.mdb[qa_temp_db_name]
+                cursor = qa_temp_db.aggregate(
+                    [
+                        {
+                            "$merge": {
+                                "into": "pages",
+                                "on": ["_id"],
+                                "whenNotMatched": "fail",
+                            }
+                        }
+                    ]
+                )
+                print(f"Merged QA data from temp db {qa_temp_db_name}")
+                # async for data in qa_temp_db.find({}):
+                #    print("qa data", data)
+
+                assert await cursor.to_list() == []
+                await qa_temp_db.drop()
+                print(f"Dropped temp db {qa_temp_db_name}")
+        # pylint: disable=broad-exception-caught
+        except Exception as e:
+            print(e)
 
     async def re_add_all_crawl_pages(
         self, org: Organization, crawl_type: Optional[str] = None
@@ -55,8 +55,8 @@ spec:
 
           resources:
             limits:
-              memory: "200Mi"
+              memory: "500Mi"
 
             requests:
-              memory: "200Mi"
-              cpu: "50m"
+              memory: "250Mi"
+              cpu: "200m"