Issue 2396 readd pages fixes (#2398)

re-add pages fixes:
- add additional memory to the background job
- copy page QA data to a separate temp collection when re-adding pages, then merge it back in (sketched below, after the changed-files summary)
Ilya Kreymer 2025-02-17 13:52:11 -08:00 committed by GitHub
parent e112f96614
commit 5bebb6161a
3 changed files with 75 additions and 6 deletions
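Note: the core of the second fix is to stash user-generated page QA fields in a temporary collection before a crawl's pages are deleted, then merge them back onto the freshly re-added pages. Below is a minimal standalone sketch of that round-trip, assuming a local MongoDB and the motor async driver; the database, collection, and field names are illustrative rather than the exact Browsertrix ones.

# Sketch only: copy selected fields out to a temp collection with $out,
# then merge them back onto the re-added documents by _id with $merge.
import asyncio
from datetime import datetime, timezone

from motor.motor_asyncio import AsyncIOMotorClient


async def re_add_with_qa_preserved(mdb, crawl_id: str) -> None:
    pages = mdb["pages"]
    ts = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
    temp_name = f"pages-qa-temp-{crawl_id}-{ts}"

    # 1. Copy only the user-generated fields for this crawl into a temp collection.
    cursor = pages.aggregate(
        [
            {"$match": {"crawl_id": crawl_id, "qa": {"$nin": [None, {}]}}},
            {"$project": {"_id": 1, "qa": 1}},
            {"$out": temp_name},
        ]
    )
    # motor builds the command lazily; consuming the cursor runs the pipeline,
    # which returns no documents ($out only writes to the temp collection)
    await cursor.to_list(length=None)

    # 2. ...delete the crawl's pages and re-add them from WACZ files here...

    # 3. Merge the stashed fields back onto pages with matching _id values.
    cursor = mdb[temp_name].aggregate(
        [{"$merge": {"into": "pages", "on": ["_id"], "whenNotMatched": "fail"}}]
    )
    await cursor.to_list(length=None)
    await mdb[temp_name].drop()


async def main() -> None:
    client = AsyncIOMotorClient("mongodb://localhost:27017")
    await re_add_with_qa_preserved(client["example_db"], "example-crawl-id")


if __name__ == "__main__":
    asyncio.run(main())

This is also why the real code below ends each $out/$merge pipeline with assert await cursor.to_list() == []: the aggregation only executes once the cursor is consumed, and both stages write to a collection rather than returning documents.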


@@ -646,6 +646,13 @@ class CrawlOps(BaseCrawlOps):
            return None, None
        return res.get("state"), res.get("finished")

    async def is_upload(self, crawl_id: str):
        """return true if archived item with this id is an upload"""
        res = await self.crawls.find_one({"_id": crawl_id}, projection={"type": 1})
        if not res:
            return False
        return res.get("type") == "upload"

    async def add_crawl_error(
        self,
        crawl_id: str,
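The new is_upload helper fetches only the type field (via the projection) and is used in the next file to skip the QA-preservation step for uploads, presumably because uploaded archived items have no crawl QA data to carry over. A trivial usage sketch, with crawl_ops standing in for an initialized CrawlOps instance:

# Sketch only: crawl_ops and describe_item are hypothetical stand-ins.
async def describe_item(crawl_ops, crawl_id: str) -> str:
    if await crawl_ops.is_upload(crawl_id):
        return f"{crawl_id} is an upload; no crawl QA data to carry over"
    return f"{crawl_id} is a crawl; QA data will be preserved on re-add"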


@@ -66,6 +66,7 @@ class PageOps:
    ):
        self.pages = mdb["pages"]
        self.crawls = mdb["crawls"]
        self.mdb = mdb
        self.crawl_ops = crawl_ops
        self.org_ops = org_ops
        self.storage_ops = storage_ops
@@ -785,9 +786,70 @@ class PageOps:
    async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
        """Delete existing pages for crawl and re-add from WACZs."""
        try:
            is_upload = await self.crawl_ops.is_upload(crawl_id)
            print(f"Processing {'upload' if is_upload else 'crawl'} {crawl_id}")

            if not is_upload:
                ts_now = dt_now().strftime("%Y%m%d%H%M%S")
                qa_temp_db_name = f"pages-qa-temp-{crawl_id}-{ts_now}"
                cursor = self.pages.aggregate(
                    [
                        {
                            "$match": {
                                "crawl_id": crawl_id,
                                "$or": [
                                    {"qa": {"$nin": [None, {}]}},
                                    {"modified": {"$ne": None}},
                                    {"userid": {"$ne": None}},
                                    {"approved": {"$ne": None}},
                                    {"notes": {"$ne": None}},
                                ],
                            }
                        },
                        {
                            "$project": {
                                "_id": 1,
                                "qa": 1,
                                "modified": 1,
                                "userid": 1,
                                "approved": 1,
                                "notes": 1,
                            }
                        },
                        {"$out": qa_temp_db_name},
                    ]
                )
                print(f"Stored QA data in temp db {qa_temp_db_name}")
                assert await cursor.to_list() == []

            await self.delete_crawl_pages(crawl_id, oid)
            print(f"Deleted pages for crawl {crawl_id}", flush=True)

            await self.add_crawl_pages_to_db_from_wacz(crawl_id)

            if not is_upload:
                qa_temp_db = self.mdb[qa_temp_db_name]
                cursor = qa_temp_db.aggregate(
                    [
                        {
                            "$merge": {
                                "into": "pages",
                                "on": ["_id"],
                                "whenNotMatched": "fail",
                            }
                        }
                    ]
                )
                print(f"Merged QA data from temp db {qa_temp_db_name}")
                # async for data in qa_temp_db.find({}):
                #     print("qa data", data)
                assert await cursor.to_list() == []
                await qa_temp_db.drop()
                print(f"Dropped temp db {qa_temp_db_name}")

        # pylint: disable=broad-exception-caught
        except Exception as e:
            print(e)

    async def re_add_all_crawl_pages(
        self, org: Organization, crawl_type: Optional[str] = None
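Note: the merge-back step relies on page _id values being stable across the delete/re-add cycle (they appear to be derived from the WACZ page records), since whenNotMatched: "fail" aborts the $merge if a stashed _id no longer exists in pages. A toy illustration of that failure mode, assuming the motor driver; the helper and collection names are hypothetical:

# Sketch only: shows how a $merge with whenNotMatched: "fail" surfaces as an error.
from pymongo.errors import OperationFailure


async def merge_back(mdb, temp_name: str) -> None:
    cursor = mdb[temp_name].aggregate(
        [{"$merge": {"into": "pages", "on": ["_id"], "whenNotMatched": "fail"}}]
    )
    try:
        # consuming the cursor runs the pipeline
        await cursor.to_list(length=None)
    except OperationFailure as exc:
        # raised by the server if a stashed _id has no matching page document
        print("merge failed:", exc)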


@@ -55,8 +55,8 @@ spec:
         resources:
           limits:
-            memory: "200Mi"
+            memory: "500Mi"
           requests:
-            memory: "200Mi"
-            cpu: "50m"
+            memory: "250Mi"
+            cpu: "200m"