Fixes #2600. This PR fixes the issue by ensuring that crawl page counts (total, unique, files, errors) are reset to 0 when crawl pages are deleted, such as right before being re-added. It also adds a migration that recalculates file and error page counts for each crawl, without re-adding pages from the WACZ files.
61 lines · 1.8 KiB · Python
"""
|
|
Migration 0045 - Recalculate crawl filePageCount and errorPageCount
|
|
"""
|
|
|
|
from btrixcloud.migrations import BaseMigration
|
|
|
|
|
|
MIGRATION_VERSION = "0045"
|
|
|
|
|
|
# pylint: disable=duplicate-code
class Migration(BaseMigration):
    """Migration 0045: recalculate per-crawl page counts.

    For every crawl with a non-zero filePageCount or errorPageCount,
    reset both counters to 0 and re-increment them from the pages
    already stored in the database, without re-adding pages from the
    WACZ files.
    """

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        """Store the db handle and the optional page_ops helper.

        :param mdb: database handle passed through to BaseMigration
        :param kwargs: may contain "page_ops", the page-operations
            helper used to recount file/error pages per crawl
        """
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

        # May be None (e.g. when not injected); migrate_up then becomes
        # a logged no-op rather than crashing.
        self.page_ops = kwargs.get("page_ops")

    async def migrate_up(self):
        """Perform migration up.

        Recalculate crawl filePageCount and errorPageCount for all
        crawls that currently have a non-zero value in either field.
        """
        # Without page_ops we cannot recount anything — bail out early
        # before touching the collection.
        if self.page_ops is None:
            print(
                "Unable to reset crawl page counts, missing page_ops",
                flush=True,
            )
            return

        crawls_mdb = self.mdb["crawls"]

        # Crawls where both counts are already 0 (or absent) need no work.
        match_query = {
            "$or": [{"errorPageCount": {"$gt": 0}}, {"filePageCount": {"$gt": 0}}]
        }
        async for crawl_raw in crawls_mdb.find(match_query, projection=["_id"]):
            crawl_id = crawl_raw["_id"]

            try:
                # Reset filePageCount and errorPageCount to 0.
                # update_one instead of find_one_and_update: the previous
                # document was discarded, so there is no need to fetch it.
                await crawls_mdb.update_one(
                    {"_id": crawl_id},
                    {
                        "$set": {
                            "filePageCount": 0,
                            "errorPageCount": 0,
                        }
                    },
                )

                # Re-increment filePageCount and errorPageCount from the
                # crawl's stored pages.
                await self.page_ops.update_crawl_file_and_error_counts(crawl_id)
            # Broad catch is deliberate: a failure on one crawl should be
            # logged and not abort the migration for the remaining crawls.
            # pylint: disable=broad-exception-caught
            except Exception as err:
                print(
                    f"Unable to update page counts for crawl {crawl_id}: {err}",
                    flush=True,
                )