browsertrix/backend/btrixcloud/migrations/migration_0045_crawl_counts.py
Tessa Walsh 6f81d588a9
Ensure crawl page counts are correct when re-adding pages (#2601)
Fixes #2600 

This PR fixes the issue by ensuring that crawl page counts (total,
unique, files, errors) are reset to 0 when crawl pages are deleted, such
as right before being re-added.

It also adds a migration that recalculates file and error page counts
for each crawl without re-adding pages from the WACZ files.
2025-05-13 14:05:41 -04:00

61 lines
1.8 KiB
Python

"""
Migration 0045 - Recalculate crawl filePageCount and errorPageCount
"""
from btrixcloud.migrations import BaseMigration
# Version identifier for this migration; handed to BaseMigration as
# `migration_version` by Migration.__init__.
MIGRATION_VERSION = "0045"
# pylint: disable=duplicate-code
class Migration(BaseMigration):
    """Migration 0045: recalculate crawl filePageCount and errorPageCount."""

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        """Register the migration version and pick up the optional page_ops.

        Args:
            mdb: database handle forwarded to BaseMigration.
            **kwargs: extra dependencies; only "page_ops" is read here.
        """
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

        # May be None; migrate_up() logs and returns early in that case.
        self.page_ops = kwargs.get("page_ops")

    async def _reset_and_recount(self, crawls, crawl_id):
        """Zero both counters for one crawl, then have page_ops recount them."""
        zeroed_counts = {
            "$set": {
                "filePageCount": 0,
                "errorPageCount": 0,
            }
        }
        # Counters are cleared first so the recount below does not start
        # from the previously stored values.
        await crawls.find_one_and_update({"_id": crawl_id}, zeroed_counts)

        # Presumably repopulates filePageCount / errorPageCount from the
        # crawl's pages -- implemented in page_ops, not visible here.
        await self.page_ops.update_crawl_file_and_error_counts(crawl_id)

    async def migrate_up(self):
        """Perform migration up.

        Walk every crawl whose stored errorPageCount or filePageCount is
        greater than zero, reset both counters to 0 and trigger a recount
        through page_ops. A failure on one crawl is printed and does not
        prevent the remaining crawls from being processed.
        """
        crawls = self.mdb["crawls"]

        # Without page_ops there is no way to recount, so leave data untouched.
        if self.page_ops is None:
            print(
                "Unable to reset crawl page counts, missing page_ops",
                flush=True,
            )
            return

        # Only crawls with at least one non-zero counter are selected.
        nonzero_counts = {
            "$or": [{"errorPageCount": {"$gt": 0}}, {"filePageCount": {"$gt": 0}}]
        }

        async for doc in crawls.find(nonzero_counts, projection=["_id"]):
            crawl_id = doc["_id"]
            try:
                await self._reset_and_recount(crawls, crawl_id)
            # pylint: disable=broad-exception-caught
            except Exception as err:
                print(
                    f"Unable to update page counts for crawl {crawl_id}: {err}",
                    flush=True,
                )