Adds `filename` to pages, pointing to the WACZ file each page comes from, along with `depth`, `favIconUrl`, and `isSeed`. Also adds an idempotent migration to backfill this information for existing pages, and increases the backend container's startupProbe time to 24 hours to give it sufficient time to finish the migration.

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
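For reference, a 24-hour startupProbe is expressed in Kubernetes as a `failureThreshold` × `periodSeconds` budget. A minimal sketch of what the backend's probe might look like follows; the endpoint path, port, and the particular 10s/8640 split are illustrative assumptions, not necessarily the chart's actual values:

```yaml
# Illustrative sketch only; the actual Helm chart values may differ.
startupProbe:
  httpGet:
    path: /healthz          # assumed health endpoint
    port: 8000              # assumed backend port
  periodSeconds: 10         # probe every 10 seconds
  failureThreshold: 8640    # 8640 checks x 10s = 86400s = 24 hours before giving up
```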
"""
|
|
Migration 0042 - Add filename to pages
|
|
"""
|
|
|
|
from btrixcloud.migrations import BaseMigration
|
|
|
|
|
|
MIGRATION_VERSION = "0042"
|
|
|
|
|
|
class Migration(BaseMigration):
|
|
"""Migration class."""
|
|
|
|
# pylint: disable=unused-argument
|
|
def __init__(self, mdb, **kwargs):
|
|
super().__init__(mdb, migration_version=MIGRATION_VERSION)
|
|
|
|
self.page_ops = kwargs.get("page_ops")
|
|
|
|
async def migrate_up(self):
|
|
"""Perform migration up.
|
|
|
|
Add filename to all pages that don't currently have it stored,
|
|
iterating through each archived item and its WACZ files as necessary
|
|
"""
|
|
pages_mdb = self.mdb["pages"]
|
|
|
|
if self.page_ops is None:
|
|
print(
|
|
"Unable to add filename and other fields to pages, missing page_ops",
|
|
flush=True,
|
|
)
|
|
return
|
|
|
|
crawl_ids_to_update = await pages_mdb.distinct("crawl_id", {"filename": None})
|
|
|
|
crawl_count = len(crawl_ids_to_update)
|
|
current_index = 1
|
|
|
|
for crawl_id in crawl_ids_to_update:
|
|
print(f"Migrating archived item {current_index}/{crawl_count}", flush=True)
|
|
try:
|
|
await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id)
|
|
# pylint: disable=broad-exception-caught
|
|
except Exception as err:
|
|
print(
|
|
f"Error adding filename and other fields to pages in item {crawl_id}: {err}",
|
|
flush=True,
|
|
)
|
|
current_index += 1
|
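The heavy lifting happens in `page_ops.add_crawl_wacz_filename_to_pages`, which is not shown in this diff. As a rough illustration of the shape of that work, here is a hypothetical sketch that re-reads each WACZ's `pages/pages.jsonl` and backfills the new fields. The collection handle, the `wacz_files` mapping, the local-file access, and the exact page-record field names (`seed`, `depth`, `favIconUrl`) are all assumptions for this sketch, not the actual btrixcloud implementation:

```python
import json
import zipfile


def iter_wacz_pages(wacz_path: str):
    """Yield page records from a WACZ's pages/pages.jsonl.

    Hypothetical local-file helper; in production the WACZ would
    more likely be streamed from object storage.
    """
    with zipfile.ZipFile(wacz_path) as zf:
        with zf.open("pages/pages.jsonl") as fh:
            for line in fh:
                record = json.loads(line)
                # skip the header record that leads the JSONL file
                if record.get("format"):
                    continue
                yield record


async def add_crawl_wacz_filename_to_pages(pages, crawl_id: str, wacz_files):
    """Backfill filename, depth, favIconUrl, and isSeed for one crawl's pages.

    `pages` is assumed to be an async (Motor-style) MongoDB collection;
    `wacz_files` is assumed to map each WACZ filename to a local path.
    """
    for filename, path in wacz_files.items():
        for page in iter_wacz_pages(path):
            await pages.update_one(
                {"_id": page["id"], "crawl_id": crawl_id},
                {
                    "$set": {
                        "filename": filename,
                        "depth": page.get("depth"),
                        "favIconUrl": page.get("favIconUrl"),
                        "isSeed": page.get("seed", False),
                    }
                },
            )
```

Because the migration only selects crawls whose pages have `filename: None`, re-running it after a partial failure picks up where it left off, which is what makes the backfill idempotent.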