Ensure crawl page counts are correct when re-adding pages (#2601)
Fixes #2600. This PR fixes the issue by ensuring that crawl page counts (total, unique, files, errors) are reset to 0 when crawl pages are deleted, such as right before they are re-added. It also adds a migration that recalculates the file and error page counts for each crawl without re-adding pages from the WACZ files.
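For illustration, the sketch below mirrors the flow exercised by the updated test further down: fetch the stored counts, trigger a page re-add, and confirm the counts come back to their previous values. The URL, ids, and auth headers are placeholders, and the re-add runs as a background job, so a real client would poll before comparing.

import requests

# Placeholder values; a running Browsertrix API, an org id, a finished crawl id,
# and valid auth headers are assumed.
API_PREFIX = "https://app.example.com/api"
org_id = "<org-id>"
crawl_id = "<crawl-id>"
auth_headers = {"Authorization": "Bearer <token>"}

crawl_url = f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}"

# Record the stored page counts before re-adding pages
before = requests.get(crawl_url, headers=auth_headers).json()

# Re-add pages: existing pages are deleted (counts reset to 0), then pages are
# re-imported from the crawl's WACZ files and the counts are re-incremented
requests.post(f"{crawl_url}/pages/reAdd", headers=auth_headers)

# Once the re-add has finished (poll in practice), the counts should match
after = requests.get(crawl_url, headers=auth_headers).json()
for field in ("pageCount", "uniquePageCount", "filePageCount", "errorPageCount"):
    assert after[field] == before[field]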
This commit is contained in:
parent 594f5bc171
commit 6f81d588a9
@@ -32,7 +32,7 @@ else:
     ) = PageOps = BackgroundJobOps = object


-CURR_DB_VERSION = "0044"
+CURR_DB_VERSION = "0045"


 # ============================================================================
backend/btrixcloud/migrations/migration_0045_crawl_counts.py (new file, 60 lines)
@@ -0,0 +1,60 @@
+"""
+Migration 0045 - Recalculate crawl filePageCount and errorPageCount
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0045"
+
+
+# pylint: disable=duplicate-code
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+        self.page_ops = kwargs.get("page_ops")
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Recalculate crawl filePageCount and errorPageCount for all crawls
+        """
+        crawls_mdb = self.mdb["crawls"]
+
+        if self.page_ops is None:
+            print(
+                "Unable to reset crawl page counts, missing page_ops",
+                flush=True,
+            )
+            return
+
+        match_query = {
+            "$or": [{"errorPageCount": {"$gt": 0}}, {"filePageCount": {"$gt": 0}}]
+        }
+        async for crawl_raw in crawls_mdb.find(match_query, projection=["_id"]):
+            crawl_id = crawl_raw["_id"]
+
+            try:
+                # Reset filePageCount and errorPageCount to 0
+                await crawls_mdb.find_one_and_update(
+                    {"_id": crawl_id},
+                    {
+                        "$set": {
+                            "filePageCount": 0,
+                            "errorPageCount": 0,
+                        }
+                    },
+                )
+
+                # Re-increment filePageCount and errorPageCount
+                await self.page_ops.update_crawl_file_and_error_counts(crawl_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(
+                    f"Unable to update page counts for crawl {crawl_id}: {err}",
+                    flush=True,
+                )
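For reference, a minimal sketch of driving this migration by hand (for example from an async shell); a deployed backend is expected to apply pending migrations itself. `mdb` and `page_ops` are assumed to be the Motor database handle and an initialized PageOps instance, matching what the migration's constructor accepts.

import asyncio

from btrixcloud.migrations.migration_0045_crawl_counts import Migration


# Hypothetical manual invocation; `mdb` and `page_ops` must be provided by the
# caller (they are normally wired up by the backend at startup).
async def run_migration(mdb, page_ops):
    migration = Migration(mdb, page_ops=page_ops)
    await migration.migrate_up()


# asyncio.run(run_migration(mdb, page_ops))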
@@ -246,18 +246,25 @@ class PageOps:
         await self.add_qa_run_for_page(page.id, oid, qa_run_id, compare)

     async def update_crawl_file_and_error_counts(
-        self, crawl_id: str, pages: List[Page]
+        self, crawl_id: str, pages: Optional[List[Page]] = None
     ):
         """Update crawl filePageCount and errorPageCount for pages."""
         file_count = 0
         error_count = 0

-        for page in pages:
-            if page.isFile:
-                file_count += 1
-
-            if page.isError:
-                error_count += 1
+        if pages is not None:
+            for page in pages:
+                if page.isFile:
+                    file_count += 1
+                if page.isError:
+                    error_count += 1
+        else:
+            # If page list not supplied, count all pages in crawl
+            async for page_raw in self.pages.find({"crawl_id": crawl_id}):
+                if page_raw.get("isFile"):
+                    file_count += 1
+                if page_raw.get("isError"):
+                    error_count += 1

         if file_count == 0 and error_count == 0:
             return
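A short sketch of the two calling patterns the widened signature now supports; `page_ops` and `pages` are assumed names here for an initialized PageOps instance and a batch of Page models, not part of this diff.

# Sketch only: `page_ops` is an initialized PageOps instance and `pages` a
# list of Page models built during WACZ import (both assumed).
async def recount_examples(page_ops, crawl_id: str, pages) -> None:
    # Import path: counts are derived from the in-memory batch of pages
    await page_ops.update_crawl_file_and_error_counts(crawl_id, pages)

    # Recount path (used by migration 0045): omit `pages` so the method
    # re-scans the crawl's pages collection instead
    await page_ops.update_crawl_file_and_error_counts(crawl_id)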
@@ -276,7 +283,7 @@ class PageOps:
         )

     async def delete_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None):
-        """Delete crawl pages from db"""
+        """Delete crawl pages from db and clear crawl page counts"""
         query: Dict[str, Union[str, UUID]] = {"crawl_id": crawl_id}
         if oid:
             query["oid"] = oid
@@ -289,6 +296,25 @@ class PageOps:
                 flush=True,
             )

+        try:
+            await self.crawls.find_one_and_update(
+                {"_id": crawl_id},
+                {
+                    "$set": {
+                        "pageCount": 0,
+                        "uniquePageCount": 0,
+                        "filePageCount": 0,
+                        "errorPageCount": 0,
+                    }
+                },
+            )
+        # pylint: disable=broad-except
+        except Exception as err:
+            print(
+                f"Error resetting page counts for crawl {crawl_id}: {err}",
+                flush=True,
+            )
+
     async def get_page_raw(
         self,
         page_id: UUID,
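And a sketch of the guarantee the added reset provides after page deletion; `page_ops` and `crawls_db` are assumed handles (a PageOps instance and the crawls collection), not part of this diff.

# Sketch only: `page_ops` and `crawls_db` are assumed to be available in the
# caller's context.
async def delete_and_verify(page_ops, crawls_db, crawl_id: str) -> None:
    await page_ops.delete_crawl_pages(crawl_id)

    # With this change, the stored counts are zeroed along with the pages
    crawl = await crawls_db.find_one({"_id": crawl_id})
    for field in ("pageCount", "uniquePageCount", "filePageCount", "errorPageCount"):
        assert crawl[field] == 0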
@@ -956,6 +956,19 @@ def test_crawl_pages_qa_filters(crawler_auth_headers, default_org_id, crawler_crawl_id):


 def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
+    # Store page counts to compare against after re-adding
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    page_count_before = data["pageCount"]
+    page_count_before_unique = data["uniquePageCount"]
+    page_count_before_files = data["filePageCount"]
+    page_count_before_errors = data["errorPageCount"]
+
     # Re-add pages and verify they were correctly added
     r = requests.post(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/reAdd",
@@ -1001,15 +1014,20 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     )
     assert r.status_code == 403

-    # Check that pageCount and uniquePageCount were stored on crawl
+    # Check that crawl page counts were recalculated properly
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
     data = r.json()
-    assert data["pageCount"] > 0
-    assert data["uniquePageCount"] > 0
+    assert data["pageCount"] > 0 and data["pageCount"] == page_count_before
+    assert (
+        data["uniquePageCount"] > 0
+        and data["uniquePageCount"] == page_count_before_unique
+    )
+    assert data["filePageCount"] == page_count_before_files
+    assert data["errorPageCount"] == page_count_before_errors


 def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id):