Ensure crawl page counts are correct when re-adding pages (#2601)

Fixes #2600 

This PR fixes the issue by ensuring that crawl page counts (total,
unique, files, errors) are reset to 0 when crawl pages are deleted, such
as right before being re-added.

It also adds a migration that recalculates file and error page counts
for each crawl without re-adding pages from the WACZ files.
Commit 6f81d588a9 (parent 594f5bc171), authored by Tessa Walsh on 2025-05-13 14:05:41 -04:00 and committed by GitHub.
4 changed files with 116 additions and 12 deletions
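
In short, the fix has two parts: delete_crawl_pages now zeroes out pageCount, uniquePageCount, filePageCount, and errorPageCount on the crawl document whenever pages are deleted, and migration 0045 rebuilds filePageCount and errorPageCount by counting the crawl's existing page documents rather than re-parsing WACZ files. The sketch below condenses that recalculation step; it uses the field names from the diff, but the function and collection-handle names (recalculate_counts, pages_coll, crawls_coll) are illustrative only, not part of the codebase.

# Illustrative sketch only: recount file/error pages for one crawl from its
# existing page documents and store the totals on the crawl document.
# Nothing is re-read from WACZ files.
async def recalculate_counts(pages_coll, crawls_coll, crawl_id: str) -> None:
    file_count = 0
    error_count = 0
    async for page in pages_coll.find({"crawl_id": crawl_id}):
        if page.get("isFile"):
            file_count += 1
        if page.get("isError"):
            error_count += 1

    await crawls_coll.find_one_and_update(
        {"_id": crawl_id},
        {"$set": {"filePageCount": file_count, "errorPageCount": error_count}},
    )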


@@ -32,7 +32,7 @@ else:
) = PageOps = BackgroundJobOps = object
CURR_DB_VERSION = "0044"
CURR_DB_VERSION = "0045"
# ============================================================================


@@ -0,0 +1,60 @@
"""
Migration 0045 - Recalculate crawl filePageCount and errorPageCount
"""
from btrixcloud.migrations import BaseMigration
MIGRATION_VERSION = "0045"
# pylint: disable=duplicate-code
class Migration(BaseMigration):
"""Migration class."""
# pylint: disable=unused-argument
def __init__(self, mdb, **kwargs):
super().__init__(mdb, migration_version=MIGRATION_VERSION)
self.page_ops = kwargs.get("page_ops")
async def migrate_up(self):
"""Perform migration up.
Recalculate crawl filePageCount and errorPageCount for all crawls
"""
crawls_mdb = self.mdb["crawls"]
if self.page_ops is None:
print(
"Unable to reset crawl page counts, missing page_ops",
flush=True,
)
return
match_query = {
"$or": [{"errorPageCount": {"$gt": 0}}, {"filePageCount": {"$gt": 0}}]
}
async for crawl_raw in crawls_mdb.find(match_query, projection=["_id"]):
crawl_id = crawl_raw["_id"]
try:
# Reset filePageCount and errorPageCount to 0
await crawls_mdb.find_one_and_update(
{"_id": crawl_id},
{
"$set": {
"filePageCount": 0,
"errorPageCount": 0,
}
},
)
# Re-increment filePageCount and errorPageCount
await self.page_ops.update_crawl_file_and_error_counts(crawl_id)
# pylint: disable=broad-exception-caught
except Exception as err:
print(
f"Unable to update page counts for crawl {crawl_id}: {err}",
flush=True,
)

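This migration is picked up by the backend's own migration machinery; the snippet below is only a hypothetical way to drive it directly, assuming a Motor client, an already-initialized PageOps instance, and a database named "browsertrix" (all assumptions, not defined by this PR). Note the guard in migrate_up: without page_ops the migration logs a message and returns without modifying any crawls.

# Hypothetical standalone driver for the migration above.
# Assumptions: Motor is installed, "browsertrix" is the database name, and
# page_ops is an initialized PageOps instance borrowed from the running app.
from motor.motor_asyncio import AsyncIOMotorClient


async def run_migration_0045(db_url: str, page_ops) -> None:
    mdb = AsyncIOMotorClient(db_url)["browsertrix"]  # assumed database name
    migration = Migration(mdb, page_ops=page_ops)  # Migration class from the new file above
    await migration.migrate_up()
    # e.g. asyncio.run(run_migration_0045("mongodb://localhost:27017", page_ops))
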

@@ -246,18 +246,25 @@ class PageOps:
        await self.add_qa_run_for_page(page.id, oid, qa_run_id, compare)

    async def update_crawl_file_and_error_counts(
-        self, crawl_id: str, pages: List[Page]
+        self, crawl_id: str, pages: Optional[List[Page]] = None
    ):
        """Update crawl filePageCount and errorPageCount for pages."""
        file_count = 0
        error_count = 0

-        for page in pages:
-            if page.isFile:
-                file_count += 1
-            if page.isError:
-                error_count += 1
+        if pages is not None:
+            for page in pages:
+                if page.isFile:
+                    file_count += 1
+                if page.isError:
+                    error_count += 1
+        else:
+            # If page list not supplied, count all pages in crawl
+            async for page_raw in self.pages.find({"crawl_id": crawl_id}):
+                if page_raw.get("isFile"):
+                    file_count += 1
+                if page_raw.get("isError"):
+                    error_count += 1

        if file_count == 0 and error_count == 0:
            return
@@ -276,7 +283,7 @@
        )

    async def delete_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None):
-        """Delete crawl pages from db"""
+        """Delete crawl pages from db and clear crawl page counts"""
        query: Dict[str, Union[str, UUID]] = {"crawl_id": crawl_id}
        if oid:
            query["oid"] = oid
@@ -289,6 +296,25 @@
                flush=True,
            )

+        try:
+            await self.crawls.find_one_and_update(
+                {"_id": crawl_id},
+                {
+                    "$set": {
+                        "pageCount": 0,
+                        "uniquePageCount": 0,
+                        "filePageCount": 0,
+                        "errorPageCount": 0,
+                    }
+                },
+            )
+        # pylint: disable=broad-except
+        except Exception as err:
+            print(
+                f"Error resetting page counts for crawl {crawl_id}: {err}",
+                flush=True,
+            )

    async def get_page_raw(
        self,
        page_id: UUID,

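A note on the fallback path in update_crawl_file_and_error_counts: when no page list is passed (as in migration 0045), it iterates every page document for the crawl in Python, which keeps the counting logic identical to the existing page-list path. The same numbers could be computed server-side in one aggregation; the sketch below is only an alternative for comparison, not what this PR implements, and pages_coll is a stand-in for the pages collection handle.

# Alternative sketch (not what the PR does): compute both counts in MongoDB
# with a single aggregation instead of iterating page documents client-side.
async def count_file_and_error_pages(pages_coll, crawl_id: str) -> tuple[int, int]:
    pipeline = [
        {"$match": {"crawl_id": crawl_id}},
        {
            "$group": {
                "_id": None,
                "fileCount": {"$sum": {"$cond": [{"$eq": ["$isFile", True]}, 1, 0]}},
                "errorCount": {"$sum": {"$cond": [{"$eq": ["$isError", True]}, 1, 0]}},
            }
        },
    ]
    results = [doc async for doc in pages_coll.aggregate(pipeline)]
    if not results:
        return 0, 0
    return results[0]["fileCount"], results[0]["errorCount"]
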

@@ -956,6 +956,19 @@ def test_crawl_pages_qa_filters(crawler_auth_headers, default_org_id, crawler_cr
def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
+    # Store page counts to compare against after re-adding
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    page_count_before = data["pageCount"]
+    page_count_before_unique = data["uniquePageCount"]
+    page_count_before_files = data["filePageCount"]
+    page_count_before_errors = data["errorPageCount"]

    # Re-add pages and verify they were correctly added
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/reAdd",
@@ -1001,15 +1014,20 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
    )
    assert r.status_code == 403

-    # Check that pageCount and uniquePageCount were stored on crawl
+    # Check that crawl page counts were recalculated properly
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
-    assert data["pageCount"] > 0
-    assert data["uniquePageCount"] > 0
+    assert data["pageCount"] > 0 and data["pageCount"] == page_count_before
+    assert (
+        data["uniquePageCount"] > 0
+        and data["uniquePageCount"] == page_count_before_unique
+    )
+    assert data["filePageCount"] == page_count_before_files
+    assert data["errorPageCount"] == page_count_before_errors


def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id):