Modify page upload migration (#2400)

Related to #2396 

Changes to migration 0037:
- Re-adds pages in the migration itself rather than in a background job, to avoid a race condition with later migrations (see the sketch below)
- Re-adds pages for all uploads in all orgs
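
For reference, a condensed sketch of the inline approach. `BaseMigration`, `UploadedCrawl`, and `page_ops.re_add_crawl_pages` are taken from the diff below; progress logging, error handling, and the collection recalculation are omitted here.

```python
# Condensed sketch -- not the full migration; see the diff below for the real code.
from btrixcloud.migrations import BaseMigration
from btrixcloud.models import UploadedCrawl

MIGRATION_VERSION = "0037"


class Migration(BaseMigration):
    """Re-add upload pages inline so later migrations can rely on them"""

    def __init__(self, mdb, **kwargs):
        super().__init__(mdb, migration_version=MIGRATION_VERSION)
        self.page_ops = kwargs.get("page_ops")

    async def migrate_up(self):
        """Add pages for every upload in every org before returning"""
        if not self.page_ops:
            return
        # The work happens inside the migration itself, so it is finished
        # before any later migration runs -- no background-job race.
        async for res in self.mdb["crawls"].find({"type": "upload"}):
            upload = UploadedCrawl.from_dict(res)
            await self.page_ops.re_add_crawl_pages(upload.id, upload.oid)
```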

Fix for re-adding pages for an org (see the sketch below):
- Ensure the org filter is applied!
- Fix wrong type
- Remove distinct; iterate over crawls with a cursor, which is faster
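
For reference, a minimal sketch of the fixed query pattern, assuming an async (Motor-style) crawls collection and the `page_ops.re_add_crawl_pages` helper from the diff below. The standalone function and its parameters are hypothetical, for illustration only.

```python
# Minimal sketch; in the real code this logic lives in PageOps.re_add_org_pages (see diff).
from typing import Dict, Optional, Union
from uuid import UUID


async def re_add_org_pages_sketch(
    crawls, page_ops, oid: UUID, crawl_type: Optional[str] = None
):
    """Iterate matching crawls with a cursor instead of collecting distinct ids up front"""
    match_query: Dict[str, Union[object, UUID]] = {
        "oid": oid,  # the org filter is always applied
        "finished": {"$ne": None},
    }
    if crawl_type in ("crawl", "upload"):
        match_query["type"] = crawl_type

    total = await crawls.count_documents(match_query)
    count = 1
    # Project only _id so the cursor stays light
    async for crawl in crawls.find(match_query, projection={"_id": 1}):
        print(f"Processing crawl {count} of {total}", flush=True)
        await page_ops.re_add_crawl_pages(crawl.get("_id"), oid)
        count += 1
```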

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Tessa Walsh, 2025-02-17 19:47:58 -05:00 (committed by GitHub)
parent 629cf7c404
commit 6c2d8c88c8
4 changed files with 44 additions and 43 deletions

View File

@@ -2,9 +2,8 @@
 Migration 0037 -- upload pages
 """
 
-from uuid import UUID
-
 from btrixcloud.migrations import BaseMigration
+from btrixcloud.models import Organization, UploadedCrawl
 
 
 MIGRATION_VERSION = "0037"
@@ -19,54 +18,52 @@ class Migration(BaseMigration):
 
         self.background_job_ops = kwargs.get("background_job_ops")
         self.page_ops = kwargs.get("page_ops")
-
-    async def org_upload_pages_already_added(self, oid: UUID) -> bool:
-        """Check if upload pages have already been added for this org"""
-        if self.page_ops is None:
-            print(
-                f"page_ops missing, assuming pages need to be added for org {oid}",
-                flush=True,
-            )
-            return False
-
-        mdb_crawls = self.mdb["crawls"]
-        async for upload in mdb_crawls.find({"oid": oid, "type": "upload"}):
-            upload_id = upload["_id"]
-            _, total = await self.page_ops.list_pages(upload_id)
-            if total > 0:
-                return True
-        return False
+        self.coll_ops = kwargs.get("coll_ops")
 
     async def migrate_up(self):
         """Perform migration up.
 
         Start background jobs to parse uploads and add their pages to db
         """
-        if self.background_job_ops is None:
-            print(
-                "Unable to start background job, missing background_job_ops", flush=True
-            )
+        if not self.background_job_ops or not self.page_ops or not self.coll_ops:
+            print("Unable to start migration, missing ops", flush=True)
             return
 
         mdb_orgs = self.mdb["organizations"]
-        async for org in mdb_orgs.find():
-            oid = org["_id"]
-            pages_already_added = await self.org_upload_pages_already_added(oid)
-
-            if pages_already_added:
-                print(
-                    f"Skipping org {oid}, upload pages already added to db", flush=True
-                )
-                continue
-
-            try:
-                await self.background_job_ops.create_re_add_org_pages_job(
-                    oid, crawl_type="upload"
-                )
-            # pylint: disable=broad-exception-caught
-            except Exception as err:
-                print(
-                    f"Error starting background job to add upload pges to org {oid}: {err}",
-                    flush=True,
-                )
+        mdb_crawls = self.mdb["crawls"]
+
+        uploads_query = {"type": "upload"}
+
+        # Re-add pages for all uploads
+        upload_count = await mdb_crawls.count_documents(uploads_query)
+        current_index = 1
+
+        async for res in mdb_crawls.find(uploads_query):
+            upload = UploadedCrawl.from_dict(res)
+            print(
+                f"Adding pages for upload {current_index}/{upload_count}",
+                flush=True,
+            )
+            try:
+                await self.page_ops.re_add_crawl_pages(upload.id, upload.oid)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(
+                    f"Error adding pages for upload {upload.id}: {err}",
+                    flush=True,
+                )
+            current_index += 1
+
+        # Update collections to account for new pages
+        async for org_dict in mdb_orgs.find({}):
+            org = Organization.from_dict(org_dict)
+            try:
+                await self.coll_ops.recalculate_org_collection_dates(org)
+                await self.coll_ops.recalculate_org_collection_counts_tags(org)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(
+                    f"Error updating collections after adding pages for org {org.id}: {err}",
+                    flush=True,
+                )

View File

@@ -1287,7 +1287,7 @@ class Page(BaseMongoModel):
     mime: Optional[str] = None
     filename: Optional[str] = None
     depth: Optional[int] = None
-    favIconUrl: Optional[AnyHttpUrl] = None
+    favIconUrl: Optional[str] = None
     isSeed: Optional[bool] = False
 
     # manual review

View File

@@ -70,7 +70,7 @@ def init_ops() -> Tuple[
         profile_ops,
     )
 
-    coll_ops = CollectionOps(mdb, crawl_manager, org_ops, event_webhook_ops)
+    coll_ops = CollectionOps(mdb, storage_ops, org_ops, event_webhook_ops)
 
     base_crawl_init = (
         mdb,

View File

@@ -841,8 +841,6 @@ class PageOps:
             ]
         )
         print(f"Merged QA data from temp db {qa_temp_db_name}")
-        # async for data in qa_temp_db.find({}):
-        #     print("qa data", data)
 
         assert await cursor.to_list() == []
         await qa_temp_db.drop()
@@ -855,13 +853,19 @@
         self, org: Organization, crawl_type: Optional[str] = None
     ):
         """Re-add pages for all crawls and uploads in org"""
-        match_query: Dict[str, object] = {"finished": {"$ne": None}}
+        match_query: Dict[str, Union[object, UUID]] = {
+            "oid": org.id,
+            "finished": {"$ne": None},
+        }
         if crawl_type in ("crawl", "upload"):
             match_query["type"] = crawl_type
 
-        crawl_ids = await self.crawls.distinct("_id", match_query)
-        for crawl_id in crawl_ids:
-            await self.re_add_crawl_pages(crawl_id, org.id)
+        count = 1
+        total = await self.crawls.count_documents(match_query)
+        async for crawl in self.crawls.find(match_query, projection={"_id": 1}):
+            print(f"Processing crawl {count} of {total}")
+            await self.re_add_crawl_pages(crawl.get("_id"), org.id)
+            count += 1
 
     async def get_qa_run_aggregate_counts(
         self,