browsertrix/backend/btrixcloud/migrations/migration_0028_page_files_errors.py
Tessa Walsh 879e509b39 Backend: Move page file and error counts to crawl replay.json endpoint (#1868)
Backend work for #1859

- Remove file count from qa stats endpoint
- Compute isFile and isError per page when a page is added
- Increment filePageCount and errorPageCount per crawl to count the number of isFile and isError pages (see the increment sketch after the migration code)
- Add file and error counts (filePageCount and errorPageCount) to the crawl replay.json endpoint
- Add migration 0028 to set isFile / isError for each existing page and aggregate filePageCount / errorPageCount per crawl
- Determine whether a page is a file based on loadState == 2, its mime type or status code, and the lack of a title (see the sketch below)
2024-06-20 19:02:57 -07:00
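
The file/error classification itself lives on the Page model rather than in this migration. As a rough illustration of the heuristic described in the bullets above, here is a minimal, hypothetical sketch; the field names, the meaning of loadState == 2, and the exact conditions are assumptions drawn from the commit message, not the actual Page implementation.

# Hypothetical sketch of the page classification heuristic; not the real Page model.
from dataclasses import dataclass
from typing import Optional


@dataclass
class PageSketch:
    loadState: Optional[int] = None
    mime: Optional[str] = None
    status: Optional[int] = None
    title: Optional[str] = None
    isFile: bool = False
    isError: bool = False

    def compute_page_type(self) -> None:
        """Mark the page as a downloaded file or an error page."""
        # Per the commit message: loadState == 2 plus a non-HTML mime type
        # or a 200 status, combined with a missing title, marks a file.
        if self.loadState == 2 and not self.title:
            non_html = bool(self.mime) and not self.mime.startswith("text/html")
            if non_html or self.status == 200:
                self.isFile = True
                return
        # Assumption: 4xx/5xx responses count as error pages.
        if self.status and self.status >= 400:
            self.isError = True

The migration file below then applies this classification to pages from older crawls and aggregates the per-crawl counts.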


"""
Migration 0028 - Page files and errors
"""
from btrixcloud.migrations import BaseMigration
from btrixcloud.models import Page, Crawl
MIGRATION_VERSION = "0028"
class Migration(BaseMigration):
"""Migration class."""
# pylint: disable=unused-argument
def __init__(self, mdb, **kwargs):
super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
Update older crawls and their pages:
- Add crawl.filePageCount and crawl.errorPageCount
- Set Page.isFile and Page.isError
"""
pages_db = self.mdb["pages"]
crawls_db = self.mdb["crawls"]
cursor = crawls_db.find({"type": "crawl", "filePageCount": None})
async for crawl_dict in cursor:
try:
crawl = Crawl.from_dict(crawl_dict)
crawl.filePageCount = 0
crawl.errorPageCount = 0
cursor = pages_db.find({"crawl_id": crawl.id})
async for page_dict in cursor:
page = Page.from_dict(page_dict)
page.compute_page_type()
if page.isFile:
crawl.filePageCount += 1
if page.isError:
crawl.errorPageCount += 1
if page.isFile or page.isError:
await pages_db.find_one_and_update(
{"_id": page.id},
{
"$set": page.dict(
include={"isFile": True, "isError": True}
)
},
)
await crawls_db.find_one_and_update(
{"_id": crawl.id, "type": "crawl"},
{
"$set": crawl.dict(
include={"filePageCount": True, "errorPageCount": True}
)
},
)
# pylint: disable=broad-exception-caught
except Exception as err:
crawl_id = crawl_dict.get("_id")
print(
f"Error updating page counts and pages for crawl {crawl_id}: {err}",
flush=True,
)
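
For the live (non-migration) path described in the commit message, incrementing filePageCount and errorPageCount as each new page is added, a rough sketch might look like the following; the function name and collection handle are illustrative assumptions, not the actual btrixcloud code.

# Hypothetical sketch: bump the per-crawl counters atomically when a page is added.
async def increment_page_counts(crawls_db, crawl_id, page) -> None:
    inc = {}
    if page.isFile:
        inc["filePageCount"] = 1
    if page.isError:
        inc["errorPageCount"] = 1
    if inc:
        # A single $inc keeps the counts consistent under concurrent page inserts.
        await crawls_db.find_one_and_update({"_id": crawl_id}, {"$inc": inc})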