# browsertrix/backend/btrixcloud/migrations/migration_0026_crawl_pages.py

"""
Migration 0026 -- Crawl Pages
"""

from btrixcloud.migrations import BaseMigration
from btrixcloud.utils import gather_tasks_with_concurrency


# Version identifier for this migration; the Migration class in this module
# passes it to BaseMigration as ``migration_version``.
MIGRATION_VERSION = "0026"


class Migration(BaseMigration):
    """Migration 0026: add page records for finished crawls that have none.

    The actual page import is delegated to the ``page_ops`` helper supplied
    as a keyword argument at construction time.
    """

    def __init__(self, mdb, **kwargs):
        """Initialize the migration.

        Args:
            mdb: Database handle; forwarded to ``BaseMigration`` and later
                indexed by collection name via ``self.mdb`` in ``migrate_up``.
            **kwargs: Optional dependencies. Only ``page_ops`` is read here.
        """
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

        # Use .get() so that constructing the migration without page_ops does
        # not fail with a bare KeyError; migrate_up reports the missing
        # dependency and skips the backfill instead.
        self.page_ops = kwargs.get("page_ops")

    async def migrate_up(self):
        """Perform migration up.

        Add pages to database for each crawl without them, pulling from WACZ files.

        The migration is best-effort: a missing ``page_ops`` helper or an
        exception raised while adding pages is printed and not re-raised.
        """
        # pylint: disable=duplicate-code
        if self.page_ops is None:
            print(
                "Unable to add pages to existing crawls, missing page_ops",
                flush=True,
            )
            return

        crawls_mdb = self.mdb["crawls"]
        pages_mdb = self.mdb["pages"]

        # Ids of documents of type "crawl" whose "finished" field is not None.
        crawl_ids = await crawls_mdb.distinct(
            "_id", {"type": "crawl", "finished": {"$ne": None}}
        )
        # Crawl ids that already have at least one page document.
        crawl_ids_with_pages = await pages_mdb.distinct("crawl_id")

        crawl_ids_no_pages = list(set(crawl_ids) - set(crawl_ids_with_pages))
        if not crawl_ids_no_pages:
            return

        # One coroutine per crawl; they are only created here and are awaited
        # by the gather helper below.
        all_coroutines = [
            self.page_ops.add_crawl_pages_to_db_from_wacz(crawl_id)
            for crawl_id in crawl_ids_no_pages
        ]

        try:
            # NOTE(review): presumably runs the coroutines with a bounded
            # level of concurrency -- confirm against btrixcloud.utils.
            await gather_tasks_with_concurrency(*all_coroutines)
        # pylint: disable=broad-exception-caught, raise-missing-from
        except Exception as err:
            print(f"Error adding pages to db: {err}", flush=True)