diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index b8c00b09..87ea7e3a 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -646,6 +646,13 @@ class CrawlOps(BaseCrawlOps):
             return None, None
         return res.get("state"), res.get("finished")
 
+    async def is_upload(self, crawl_id: str):
+        """return true if archived item with this id is an upload"""
+        res = await self.crawls.find_one({"_id": crawl_id}, projection={"type": 1})
+        if not res:
+            return False
+        return res.get("type") == "upload"
+
     async def add_crawl_error(
         self,
         crawl_id: str,
diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index 8ce69a58..9bcb397c 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -66,6 +66,7 @@ class PageOps:
     ):
         self.pages = mdb["pages"]
         self.crawls = mdb["crawls"]
+        self.mdb = mdb
         self.crawl_ops = crawl_ops
         self.org_ops = org_ops
         self.storage_ops = storage_ops
@@ -785,9 +786,70 @@ class PageOps:
 
     async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
         """Delete existing pages for crawl and re-add from WACZs."""
-        await self.delete_crawl_pages(crawl_id, oid)
-        print(f"Deleted pages for crawl {crawl_id}", flush=True)
-        await self.add_crawl_pages_to_db_from_wacz(crawl_id)
+
+        try:
+            is_upload = await self.crawl_ops.is_upload(crawl_id)
+            print(f"Processing {'upload' if is_upload else 'crawl'} {crawl_id}")
+            if not is_upload:
+                ts_now = dt_now().strftime("%Y%m%d%H%M%S")
+                qa_temp_db_name = f"pages-qa-temp-{crawl_id}-{ts_now}"
+                cursor = self.pages.aggregate(
+                    [
+                        {
+                            "$match": {
+                                "crawl_id": crawl_id,
+                                "$or": [
+                                    {"qa": {"$nin": [None, {}]}},
+                                    {"modified": {"$ne": None}},
+                                    {"userid": {"$ne": None}},
+                                    {"approved": {"$ne": None}},
+                                    {"notes": {"$ne": None}},
+                                ],
+                            }
+                        },
+                        {
+                            "$project": {
+                                "_id": 1,
+                                "qa": 1,
+                                "modified": 1,
+                                "userid": 1,
+                                "approved": 1,
+                                "notes": 1,
+                            }
+                        },
+                        {"$out": qa_temp_db_name},
+                    ]
+                )
+                print(f"Stored QA data in temp db {qa_temp_db_name}")
+                assert await cursor.to_list() == []
+
+            await self.delete_crawl_pages(crawl_id, oid)
+            print(f"Deleted pages for crawl {crawl_id}", flush=True)
+            await self.add_crawl_pages_to_db_from_wacz(crawl_id)
+
+            if not is_upload:
+                qa_temp_db = self.mdb[qa_temp_db_name]
+                cursor = qa_temp_db.aggregate(
+                    [
+                        {
+                            "$merge": {
+                                "into": "pages",
+                                "on": ["_id"],
+                                "whenNotMatched": "fail",
+                            }
+                        }
+                    ]
+                )
+                print(f"Merged QA data from temp db {qa_temp_db_name}")
+                # async for data in qa_temp_db.find({}):
+                # print("qa data", data)
+
+                assert await cursor.to_list() == []
+                await qa_temp_db.drop()
+                print(f"Dropped temp db {qa_temp_db_name}")
+        # pylint: disable=broad-exception-caught
+        except Exception as e:
+            print(e)
 
     async def re_add_all_crawl_pages(
         self, org: Organization, crawl_type: Optional[str] = None
diff --git a/chart/app-templates/background_job.yaml b/chart/app-templates/background_job.yaml
index f47dd2ac..8c02f210 100644
--- a/chart/app-templates/background_job.yaml
+++ b/chart/app-templates/background_job.yaml
@@ -55,8 +55,8 @@ spec:
 
           resources:
             limits:
-              memory: "200Mi"
+              memory: "500Mi"
 
             requests:
-              memory: "200Mi"
-              cpu: "50m"
+              memory: "250Mi"
+              cpu: "200m"