browsertrix/backend/btrixcloud/migrations/migration_0037_upload_pages.py
Tessa Walsh f8fb2d2c8d
Rework crawl page migration + MongoDB Query Optimizations (#2412)
Fixes #2406 

Converts migration 0042 to launch a background job (parallelized across
several pods) that migrates all crawls by optimizing their pages and
setting `version: 2` on each crawl when complete.
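
A minimal sketch of how such a parallelized worker could claim and
migrate crawls safely across pods (the collection handle, the
`optimize_crawl_pages` helper, and the exact query shape are assumptions
for illustration, not the actual implementation):

```python
# Hedged sketch: each migration pod repeatedly claims one unmigrated
# crawl at a time; find_one_and_update makes the claim atomic, so pods
# running in parallel never pick up the same crawl.
async def migrate_next_crawl(crawls) -> bool:
    """Claim one unmigrated crawl, optimize its pages, mark it version 2."""
    crawl = await crawls.find_one_and_update(
        {"version": {"$ne": 2}, "isMigrating": {"$ne": True}},
        {"$set": {"isMigrating": True}},
    )
    if crawl is None:
        return False  # nothing left to migrate

    # Hypothetical page-optimization helper
    await optimize_crawl_pages(crawl["_id"])

    # Mark complete so replay endpoints can treat this crawl as migrated
    await crawls.update_one(
        {"_id": crawl["_id"]},
        {"$set": {"version": 2, "isMigrating": False}},
    )
    return True
```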

Also optimizes MongoDB queries for better performance.

Migration Improvements:

- Add `isMigrating` and `version` fields to `BaseCrawl`
- Add new background job type to use in migration with accompanying
`migration_job.yaml` template that allows for parallelization
- Add new API endpoint to launch this crawl migration job, and ensure
that we have list and retry endpoints for superusers that work with
background jobs that aren't tied to a specific org
- Rework background job models and methods now that not all background
jobs are tied to a single org
- Ensure new crawls and uploads have `version` set to `2`
- Modify crawl and collection replay.json endpoints to only include
fields for replay optimization (`initialPages`, `pageQueryUrl`,
`preloadResources`) if all relevant crawls/uploads have `version` set to
`2` (see the sketch after this list)
- Remove `distinct` calls from migration pathways
- Consolidate collection stats recomputation
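
As referenced in the replay.json item above, the version gating could
look like this minimal sketch (function and parameter names are
assumptions; only the field names and the version check come from the
changes listed):

```python
from typing import Any


def add_replay_optimizations(
    replay_json: dict[str, Any],
    crawls: list[dict[str, Any]],
    initial_pages: list[dict[str, Any]],
    page_query_url: str,
    preload_resources: list[dict[str, Any]],
) -> dict[str, Any]:
    """Attach optimized-replay fields only when every crawl is on version 2."""
    if all(crawl.get("version") == 2 for crawl in crawls):
        replay_json["initialPages"] = initial_pages
        replay_json["pageQueryUrl"] = page_query_url
        replay_json["preloadResources"] = preload_resources
    return replay_json
```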

Query Optimizations:
- Remove all uses of $group and $facet
- Optimize /replay.json endpoints to precompute `preload_resources` and
avoid fetching the crawl list twice
- Optimize /collections endpoint by not fetching resources 
- Rename /urls -> /pageUrlCounts and avoid $group; instead sort with an
index, either by seed + ts or by url, to get top matches
- Use $gte instead of $regex to get prefix matches on URL (see the
sketch after this list)
- Use $text instead of $regex for text search on title
- Remove total from /pages and /pageUrlCounts queries by not using
$facet
- frontend: only call /pageUrlCounts when the dialog is opened
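
A sketch of the index-friendly query rewrites described above (the
collection and field names are assumptions, and `$text` requires a text
index on the title field):

```python
def url_prefix_query(prefix: str) -> dict:
    """Prefix match on `url` that can seek into a sorted index.

    Equivalent in effect to {"url": {"$regex": "^" + prefix}}, but the
    range form lets MongoDB walk the url index instead of scanning.
    """
    return {"url": {"$gte": prefix, "$lt": prefix + "\uffff"}}


def title_text_query(search: str) -> dict:
    """Title search via $text (text index required) instead of unanchored $regex."""
    return {"$text": {"$search": search}}


# e.g. top prefix matches for /pageUrlCounts, sorted via the url index:
# cursor = pages.find(url_prefix_query("https://example.com/")).sort("url", 1).limit(10)
```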


---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Co-authored-by: Emma Segal-Grossman <hi@emma.cafe>
Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
2025-02-20 15:26:11 -08:00


"""
Migration 0037 -- upload pages
"""
from btrixcloud.migrations import BaseMigration
from btrixcloud.models import Organization, UploadedCrawl
MIGRATION_VERSION = "0037"
class Migration(BaseMigration):
"""Migration class."""
# pylint: disable=unused-argument
def __init__(self, mdb, **kwargs):
super().__init__(mdb, migration_version=MIGRATION_VERSION)
self.background_job_ops = kwargs.get("background_job_ops")
self.page_ops = kwargs.get("page_ops")
self.coll_ops = kwargs.get("coll_ops")
async def migrate_up(self):
"""Perform migration up.
Start background jobs to parse uploads and add their pages to db
"""
if not self.background_job_ops or not self.page_ops or not self.coll_ops:
print("Unable to start migration, missing ops", flush=True)
return
mdb_orgs = self.mdb["organizations"]
mdb_crawls = self.mdb["crawls"]
uploads_query = {"type": "upload"}
# Re-add pages for all uploads
upload_count = await mdb_crawls.count_documents(uploads_query)
current_index = 1
async for res in mdb_crawls.find(uploads_query):
upload = UploadedCrawl.from_dict(res)
print(
f"Adding pages for upload {current_index}/{upload_count}",
flush=True,
)
try:
await self.page_ops.re_add_crawl_pages(upload.id, upload.oid)
# pylint: disable=broad-exception-caught
except Exception as err:
print(
f"Error adding pages for upload {upload.id}: {err}",
flush=True,
)
current_index += 1
# Update collections to account for new pages
async for org_dict in mdb_orgs.find({}):
org = Organization.from_dict(org_dict)
try:
await self.coll_ops.recalculate_org_collection_stats(org)
# pylint: disable=broad-exception-caught
except Exception as err:
print(
f"Error updating collections after adding pages for org {org.id}: {err}",
flush=True,
)