Issue 2396 readd pages fixes (#2398)

readd pages fixes:
- add additional mem to background job
- copy page qa data to separate temp coll when re-adding pages, then merge back in
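Re-adding pages deletes all page documents for an archived item and recreates them from its WACZ files, which previously wiped any per-page QA results, approvals, notes, and modification metadata users had accumulated. This commit snapshots those fields into a temporary collection with $out before the delete, then merges them back onto the recreated pages with $merge. Below is a minimal sketch of that pattern, written against plain pymongo; the connection string, database name, and crawl id are illustrative, with field names mirroring the diff that follows:

from pymongo import MongoClient

db = MongoClient("mongodb://localhost:27017")["example"]
pages = db["pages"]

CRAWL_ID = "crawl-1"
TEMP = f"pages-qa-temp-{CRAWL_ID}"

# 1. Snapshot user-entered review state into a temp collection.
#    $nin excludes pages whose qa field is missing, null, or an empty
#    dict, i.e. pages with no QA runs worth preserving.
pages.aggregate(
    [
        {
            "$match": {
                "crawl_id": CRAWL_ID,
                "$or": [
                    {"qa": {"$nin": [None, {}]}},
                    {"modified": {"$ne": None}},
                    {"userid": {"$ne": None}},
                    {"approved": {"$ne": None}},
                    {"notes": {"$ne": None}},
                ],
            }
        },
        {
            "$project": {
                "_id": 1,
                "qa": 1,
                "modified": 1,
                "userid": 1,
                "approved": 1,
                "notes": 1,
            }
        },
        # $out writes the pipeline output to TEMP; the cursor yields nothing
        {"$out": TEMP},
    ]
)

# 2. Here the real job deletes all pages for the crawl and re-creates
#    them from the WACZ files; the rebuilt documents keep their original
#    _id values, which is what step 3 joins on.

# 3. Merge the snapshot back over the rebuilt pages. The default
#    whenMatched="merge" overlays the saved fields onto each page;
#    whenNotMatched="fail" aborts if a reviewed page was not re-created.
db[TEMP].aggregate(
    [{"$merge": {"into": "pages", "on": "_id", "whenNotMatched": "fail"}}]
)
db.drop_collection(TEMP)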
			
			
parent e112f96614
commit 5bebb6161a
@@ -646,6 +646,13 @@ class CrawlOps(BaseCrawlOps):
             return None, None
         return res.get("state"), res.get("finished")
 
+    async def is_upload(self, crawl_id: str):
+        """return true if archived item with this id is an upload"""
+        res = await self.crawls.find_one({"_id": crawl_id}, projection={"type": 1})
+        if not res:
+            return False
+        return res.get("type") == "upload"
+
     async def add_crawl_error(
         self,
         crawl_id: str,
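The new is_upload helper lets the page re-add path tell uploads apart from crawls, presumably because QA runs and review data only apply to crawls, so uploads can skip the snapshot-and-merge round trip below. The projection on type keeps the lookup from fetching the full crawl document.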
@@ -66,6 +66,7 @@ class PageOps:
     ):
         self.pages = mdb["pages"]
         self.crawls = mdb["crawls"]
+        self.mdb = mdb
         self.crawl_ops = crawl_ops
         self.org_ops = org_ops
         self.storage_ops = storage_ops
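PageOps previously held handles only to the named pages and crawls collections; keeping the raw mdb database handle as well lets re_add_crawl_pages address the dynamically named temp collection (self.mdb[qa_temp_db_name]) created during the QA round trip.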
@@ -785,9 +786,70 @@ class PageOps:
 
     async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
         """Delete existing pages for crawl and re-add from WACZs."""
-        await self.delete_crawl_pages(crawl_id, oid)
-        print(f"Deleted pages for crawl {crawl_id}", flush=True)
-        await self.add_crawl_pages_to_db_from_wacz(crawl_id)
+
+        try:
+            is_upload = await self.crawl_ops.is_upload(crawl_id)
+            print(f"Processing {'upload' if is_upload else 'crawl'} {crawl_id}")
+            if not is_upload:
+                ts_now = dt_now().strftime("%Y%m%d%H%M%S")
+                qa_temp_db_name = f"pages-qa-temp-{crawl_id}-{ts_now}"
+                cursor = self.pages.aggregate(
+                    [
+                        {
+                            "$match": {
+                                "crawl_id": crawl_id,
+                                "$or": [
+                                    {"qa": {"$nin": [None, {}]}},
+                                    {"modified": {"$ne": None}},
+                                    {"userid": {"$ne": None}},
+                                    {"approved": {"$ne": None}},
+                                    {"notes": {"$ne": None}},
+                                ],
+                            }
+                        },
+                        {
+                            "$project": {
+                                "_id": 1,
+                                "qa": 1,
+                                "modified": 1,
+                                "userid": 1,
+                                "approved": 1,
+                                "notes": 1,
+                            }
+                        },
+                        {"$out": qa_temp_db_name},
+                    ]
+                )
+                print(f"Stored QA data in temp db {qa_temp_db_name}")
+                assert await cursor.to_list() == []
+
+            await self.delete_crawl_pages(crawl_id, oid)
+            print(f"Deleted pages for crawl {crawl_id}", flush=True)
+            await self.add_crawl_pages_to_db_from_wacz(crawl_id)
+
+            if not is_upload:
+                qa_temp_db = self.mdb[qa_temp_db_name]
+                cursor = qa_temp_db.aggregate(
+                    [
+                        {
+                            "$merge": {
+                                "into": "pages",
+                                "on": ["_id"],
+                                "whenNotMatched": "fail",
+                            }
+                        }
+                    ]
+                )
+                print(f"Merged QA data from temp db {qa_temp_db_name}")
+                # async for data in qa_temp_db.find({}):
+                #    print("qa data", data)
+
+                assert await cursor.to_list() == []
+                await qa_temp_db.drop()
+                print(f"Dropped temp db {qa_temp_db_name}")
+        # pylint: disable=broad-exception-caught
+        except Exception as e:
+            print(e)
 
     async def re_add_all_crawl_pages(
         self, org: Organization, crawl_type: Optional[str] = None
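A few details worth noting in this hunk: the temp collection name carries a timestamp suffix, so two re-add jobs for the same crawl cannot collide on the same collection; the assert await cursor.to_list() == [] lines simply drain the cursors, since $out and $merge stages write to a collection and return no documents; and the broad except keeps a failure in the QA round trip from crashing the whole background job, at the cost of only printing the error. With whenNotMatched: "fail", the merge-back aborts if any snapshotted page _id no longer exists, e.g. because a reviewed page was not re-created from the WACZs. Continuing the pymongo sketch above, that failure would surface roughly as follows (illustrative; note $merge is not atomic, so some documents may already have been merged when it fails):

from pymongo.errors import OperationFailure

try:
    db[TEMP].aggregate(
        [{"$merge": {"into": "pages", "on": "_id", "whenNotMatched": "fail"}}]
    )
except OperationFailure as err:
    # the server aborts the $merge and pymongo raises OperationFailure
    print("QA merge-back failed:", err)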
@@ -55,8 +55,8 @@ spec:
 
           resources:
             limits:
-              memory: "200Mi"
+              memory: "500Mi"
 
             requests:
-              memory: "200Mi"
-              cpu: "50m"
+              memory: "250Mi"
+              cpu: "200m"
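Finally, per the commit message, the chart gives the background job more headroom for the extra aggregation passes: the memory limit rises from 200Mi to 500Mi, the memory request from 200Mi to 250Mi, and the CPU request from 50m to 200m.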