browsertrix/backend/btrixcloud/migrations/migration_0028_page_files_errors.py
Tessa Walsh 879e509b39 Backend: Move page file and error counts to crawl replay.json endpoint (#1868)
Backend work for #1859

- Remove file count from qa stats endpoint
- Compute isFile and isError per page when a page is added
- Increment filePageCount and errorPageCount per crawl to count the number of isFile and isError pages (see the increment sketch after the migration code)
- Add file and error counts (filePageCount and errorPageCount) to the crawl replay.json endpoint
- Add migration 0028 to set isFile / isError for each existing page and aggregate filePageCount / errorPageCount per crawl
- Determine whether a page is a file based on loadState == 2, its mime type or status code, and the lack of a title (see the sketch below)
2024-06-20 19:02:57 -07:00
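
The file/error classification itself lives on the Page model rather than in this migration. As a rough illustration of the heuristic described in the bullets above, here is a minimal, hypothetical sketch; the field names, the meaning of loadState == 2, and the exact conditions are assumptions drawn from the commit message, not the actual Page implementation.

# Hypothetical sketch of the page classification heuristic; not the real Page model.
from dataclasses import dataclass
from typing import Optional


@dataclass
class PageSketch:
    loadState: Optional[int] = None
    mime: Optional[str] = None
    status: Optional[int] = None
    title: Optional[str] = None
    isFile: bool = False
    isError: bool = False

    def compute_page_type(self) -> None:
        """Mark the page as a downloaded file or an error page."""
        # Per the commit message: loadState == 2 plus a non-HTML mime type
        # or a 200 status, combined with a missing title, marks a file.
        if self.loadState == 2 and not self.title:
            non_html = bool(self.mime) and not self.mime.startswith("text/html")
            if non_html or self.status == 200:
                self.isFile = True
                return
        # Assumption: 4xx/5xx responses count as error pages.
        if self.status and self.status >= 400:
            self.isError = True

The migration file below then applies this classification to pages from older crawls and aggregates the per-crawl counts.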


"""
Migration 0028 - Page files and errors
"""
from btrixcloud.migrations import BaseMigration
from btrixcloud.models import Page, Crawl
MIGRATION_VERSION = "0028"
class Migration(BaseMigration):
"""Migration class."""
# pylint: disable=unused-argument
def __init__(self, mdb, **kwargs):
super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
Update older crawls and their pages:
- Add crawl.filePageCount and crawl.errorPageCount
- Set Page.isFile and Page.isError
"""
pages_db = self.mdb["pages"]
crawls_db = self.mdb["crawls"]
cursor = crawls_db.find({"type": "crawl", "filePageCount": None})
async for crawl_dict in cursor:
try:
crawl = Crawl.from_dict(crawl_dict)
crawl.filePageCount = 0
crawl.errorPageCount = 0
cursor = pages_db.find({"crawl_id": crawl.id})
async for page_dict in cursor:
page = Page.from_dict(page_dict)
page.compute_page_type()
if page.isFile:
crawl.filePageCount += 1
if page.isError:
crawl.errorPageCount += 1
if page.isFile or page.isError:
await pages_db.find_one_and_update(
{"_id": page.id},
{
"$set": page.dict(
include={"isFile": True, "isError": True}
)
},
)
await crawls_db.find_one_and_update(
{"_id": crawl.id, "type": "crawl"},
{
"$set": crawl.dict(
include={"filePageCount": True, "errorPageCount": True}
)
},
)
# pylint: disable=broad-exception-caught
except Exception as err:
crawl_id = crawl_dict.get("_id")
print(
f"Error updating page counts and pages for crawl {crawl_id}: {err}",
flush=True,
)
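
For the live (non-migration) path described in the commit message, incrementing filePageCount and errorPageCount as each new page is added, a rough sketch might look like the following; the function name and collection handle are illustrative assumptions, not the actual btrixcloud code.

# Hypothetical sketch: bump the per-crawl counters atomically when a page is added.
async def increment_page_counts(crawls_db, crawl_id, page) -> None:
    inc = {}
    if page.isFile:
        inc["filePageCount"] = 1
    if page.isError:
        inc["errorPageCount"] = 1
    if inc:
        # A single $inc keeps the counts consistent under concurrent page inserts.
        await crawls_db.find_one_and_update({"_id": crawl_id}, {"$inc": inc})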