Ensure crawl page counts are correct when re-adding pages (#2601)

Fixes #2600 

This PR fixes the issue by ensuring that crawl page counts (total,
unique, files, errors) are reset to 0 when crawl pages are deleted, such
as right before being re-added.

It also adds a migration that recalculates file and error page counts
for each crawl without re-adding pages from the WACZ files.
Commit 6f81d588a9 (parent 594f5bc171), authored by Tessa Walsh on 2025-05-13 14:05:41 -04:00 and committed by GitHub.
4 changed files with 116 additions and 12 deletions
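
In short, the fix has two parts: delete_crawl_pages now zeroes out pageCount, uniquePageCount, filePageCount, and errorPageCount on the crawl document whenever pages are deleted, and migration 0045 rebuilds filePageCount and errorPageCount by counting the crawl's existing page documents rather than re-parsing WACZ files. The sketch below condenses that recalculation step; it uses the field names from the diff, but the function and collection-handle names (recalculate_counts, pages_coll, crawls_coll) are illustrative only, not part of the codebase.

# Illustrative sketch only: recount file/error pages for one crawl from its
# existing page documents and store the totals on the crawl document.
# Nothing is re-read from WACZ files.
async def recalculate_counts(pages_coll, crawls_coll, crawl_id: str) -> None:
    file_count = 0
    error_count = 0
    async for page in pages_coll.find({"crawl_id": crawl_id}):
        if page.get("isFile"):
            file_count += 1
        if page.get("isError"):
            error_count += 1

    await crawls_coll.find_one_and_update(
        {"_id": crawl_id},
        {"$set": {"filePageCount": file_count, "errorPageCount": error_count}},
    )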


@@ -32,7 +32,7 @@ else:
) = PageOps = BackgroundJobOps = object
CURR_DB_VERSION = "0044"
CURR_DB_VERSION = "0045"
# ============================================================================


@@ -0,0 +1,60 @@
"""
Migration 0045 - Recalculate crawl filePageCount and errorPageCount
"""
from btrixcloud.migrations import BaseMigration
MIGRATION_VERSION = "0045"
# pylint: disable=duplicate-code
class Migration(BaseMigration):
"""Migration class."""
# pylint: disable=unused-argument
def __init__(self, mdb, **kwargs):
super().__init__(mdb, migration_version=MIGRATION_VERSION)
self.page_ops = kwargs.get("page_ops")
async def migrate_up(self):
"""Perform migration up.
Recalculate crawl filePageCount and errorPageCount for all crawls
"""
crawls_mdb = self.mdb["crawls"]
if self.page_ops is None:
print(
"Unable to reset crawl page counts, missing page_ops",
flush=True,
)
return
match_query = {
"$or": [{"errorPageCount": {"$gt": 0}}, {"filePageCount": {"$gt": 0}}]
}
async for crawl_raw in crawls_mdb.find(match_query, projection=["_id"]):
crawl_id = crawl_raw["_id"]
try:
# Reset filePageCount and errorPageCount to 0
await crawls_mdb.find_one_and_update(
{"_id": crawl_id},
{
"$set": {
"filePageCount": 0,
"errorPageCount": 0,
}
},
)
# Re-increment filePageCount and errorPageCount
await self.page_ops.update_crawl_file_and_error_counts(crawl_id)
# pylint: disable=broad-exception-caught
except Exception as err:
print(
f"Unable to update page counts for crawl {crawl_id}: {err}",
flush=True,
)

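This migration is picked up by the backend's own migration machinery; the snippet below is only a hypothetical way to drive it directly, assuming a Motor client, an already-initialized PageOps instance, and a database named "browsertrix" (all assumptions, not defined by this PR). Note the guard in migrate_up: without page_ops the migration logs a message and returns without modifying any crawls.

# Hypothetical standalone driver for the migration above.
# Assumptions: Motor is installed, "browsertrix" is the database name, and
# page_ops is an initialized PageOps instance borrowed from the running app.
from motor.motor_asyncio import AsyncIOMotorClient


async def run_migration_0045(db_url: str, page_ops) -> None:
    mdb = AsyncIOMotorClient(db_url)["browsertrix"]  # assumed database name
    migration = Migration(mdb, page_ops=page_ops)  # Migration class from the new file above
    await migration.migrate_up()
    # e.g. asyncio.run(run_migration_0045("mongodb://localhost:27017", page_ops))
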

@@ -246,18 +246,25 @@ class PageOps:
        await self.add_qa_run_for_page(page.id, oid, qa_run_id, compare)

    async def update_crawl_file_and_error_counts(
-        self, crawl_id: str, pages: List[Page]
+        self, crawl_id: str, pages: Optional[List[Page]] = None
    ):
        """Update crawl filePageCount and errorPageCount for pages."""
        file_count = 0
        error_count = 0

-        for page in pages:
-            if page.isFile:
-                file_count += 1
-            if page.isError:
-                error_count += 1
+        if pages is not None:
+            for page in pages:
+                if page.isFile:
+                    file_count += 1
+                if page.isError:
+                    error_count += 1
+        else:
+            # If page list not supplied, count all pages in crawl
+            async for page_raw in self.pages.find({"crawl_id": crawl_id}):
+                if page_raw.get("isFile"):
+                    file_count += 1
+                if page_raw.get("isError"):
+                    error_count += 1

        if file_count == 0 and error_count == 0:
            return
@@ -276,7 +283,7 @@
        )

    async def delete_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None):
-        """Delete crawl pages from db"""
+        """Delete crawl pages from db and clear crawl page counts"""
        query: Dict[str, Union[str, UUID]] = {"crawl_id": crawl_id}
        if oid:
            query["oid"] = oid
@@ -289,6 +296,25 @@
                flush=True,
            )

+        try:
+            await self.crawls.find_one_and_update(
+                {"_id": crawl_id},
+                {
+                    "$set": {
+                        "pageCount": 0,
+                        "uniquePageCount": 0,
+                        "filePageCount": 0,
+                        "errorPageCount": 0,
+                    }
+                },
+            )
+        # pylint: disable=broad-except
+        except Exception as err:
+            print(
+                f"Error resetting page counts for crawl {crawl_id}: {err}",
+                flush=True,
+            )

    async def get_page_raw(
        self,
        page_id: UUID,

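A note on the fallback path in update_crawl_file_and_error_counts: when no page list is passed (as in migration 0045), it iterates every page document for the crawl in Python, which keeps the counting logic identical to the existing page-list path. The same numbers could be computed server-side in one aggregation; the sketch below is only an alternative for comparison, not what this PR implements, and pages_coll is a stand-in for the pages collection handle.

# Alternative sketch (not what the PR does): compute both counts in MongoDB
# with a single aggregation instead of iterating page documents client-side.
async def count_file_and_error_pages(pages_coll, crawl_id: str) -> tuple[int, int]:
    pipeline = [
        {"$match": {"crawl_id": crawl_id}},
        {
            "$group": {
                "_id": None,
                "fileCount": {"$sum": {"$cond": [{"$eq": ["$isFile", True]}, 1, 0]}},
                "errorCount": {"$sum": {"$cond": [{"$eq": ["$isError", True]}, 1, 0]}},
            }
        },
    ]
    results = [doc async for doc in pages_coll.aggregate(pipeline)]
    if not results:
        return 0, 0
    return results[0]["fileCount"], results[0]["errorCount"]
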

@@ -956,6 +956,19 @@ def test_crawl_pages_qa_filters(crawler_auth_headers, default_org_id, crawler_cr
def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
+    # Store page counts to compare against after re-adding
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    page_count_before = data["pageCount"]
+    page_count_before_unique = data["uniquePageCount"]
+    page_count_before_files = data["filePageCount"]
+    page_count_before_errors = data["errorPageCount"]

    # Re-add pages and verify they were correctly added
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/reAdd",
@@ -1001,15 +1014,20 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
    )
    assert r.status_code == 403

-    # Check that pageCount and uniquePageCount were stored on crawl
+    # Check that crawl page counts were recalculated properly
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
-    assert data["pageCount"] > 0
-    assert data["uniquePageCount"] > 0
+    assert data["pageCount"] > 0 and data["pageCount"] == page_count_before
+    assert (
+        data["uniquePageCount"] > 0
+        and data["uniquePageCount"] == page_count_before_unique
+    )
+    assert data["filePageCount"] == page_count_before_files
+    assert data["errorPageCount"] == page_count_before_errors


def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id):