browsertrix/backend/btrixcloud/migrations/migration_0042_page_filenames.py
Tessa Walsh 0e9e70f3a3
Add WACZ filename, depth, favIconUrl, isSeed to pages (#2352)
Adds `filename` to pages, pointing to the WACZ file each page comes
from, as well as `depth`, `favIconUrl`, and `isSeed`. Also adds an idempotent
migration to backfill this information for existing pages, and increases
the backend container's startupProbe time to 24 hours to give the migration
sufficient time to finish.
---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
2025-02-05 15:50:04 -05:00

51 lines
1.5 KiB
Python

"""
Migration 0042 - Add filename to pages
"""
from btrixcloud.migrations import BaseMigration
# Version identifier for this migration; passed to BaseMigration as `migration_version`.
MIGRATION_VERSION = "0042"
class Migration(BaseMigration):
    """Backfill the WACZ filename (and related fields) on existing pages."""

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        """Set up the migration.

        Registers MIGRATION_VERSION with the base class and stores the
        optional `page_ops` keyword argument (None when not supplied).
        """
        super().__init__(mdb, migration_version=MIGRATION_VERSION)
        self.page_ops = kwargs.get("page_ops")

    async def migrate_up(self):
        """Perform migration up.

        Collects the distinct `crawl_id` values of pages matched by the
        `{"filename": None}` filter and, for each one, asks `page_ops` to add
        the WACZ filename to that archived item's pages.

        The migration is skipped (with a printed notice) when `page_ops` was
        not provided. A failure on one archived item is printed and does not
        stop the remaining items from being processed.
        """
        pages_collection = self.mdb["pages"]

        # Without page_ops there is nothing that can perform the backfill.
        if self.page_ops is None:
            print(
                "Unable to add filename and other fields to pages, missing page_ops",
                flush=True,
            )
            return

        pending_crawl_ids = await pages_collection.distinct(
            "crawl_id", {"filename": None}
        )
        crawl_count = len(pending_crawl_ids)

        # Progress is reported 1-based: "item 1/N" ... "item N/N".
        for current_index, crawl_id in enumerate(pending_crawl_ids, start=1):
            print(f"Migrating archived item {current_index}/{crawl_count}", flush=True)
            try:
                await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id)
            # Best-effort per item: report the error and move on to the next one.
            # pylint: disable=broad-exception-caught
            except Exception as err:
                print(
                    f"Error adding filename and other fields to pages in item {crawl_id}: {err}",
                    flush=True,
                )