Backend work for #1859:

- Remove file count from the QA stats endpoint
- Compute `isFile` or `isError` per page when the page is added
- Increment `filePageCount` and `errorPageCount` per crawl to count the number of `isFile` or `isError` pages
- Add file and error counts (`filePageCount` and `errorPageCount`) to the crawl replay.json endpoint
- Add migration 0028 to set `isFile` / `isError` on each page and aggregate `filePageCount` / `errorPageCount` per crawl
- Determine whether a page is a file based on `loadState == 2`, its mime type or status code, and the lack of a title (see the sketch after this list)
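As a rough, self-contained sketch of that last heuristic (not the project's actual `compute_page_type` implementation): the `PageStub` class, the example mime set, and the `status == 206` check below are illustrative assumptions; only the loadState == 2 / mime-or-status / missing-title combination comes from the description above.

from dataclasses import dataclass
from typing import Optional

# Hypothetical non-HTML mime types that suggest a file download (examples only)
FILE_MIME_TYPES = {"application/pdf", "application/zip", "application/octet-stream"}


@dataclass
class PageStub:
    """Simplified stand-in for the Page model, for illustration only."""

    loadState: int
    mime: Optional[str] = None
    status: Optional[int] = None
    title: Optional[str] = None
    isFile: bool = False
    isError: bool = False


def compute_page_type(page: PageStub) -> None:
    """Flag a page as a file: loadState == 2, a download-like mime type
    or status code, and no title."""
    if page.loadState == 2 and not page.title:
        if page.mime in FILE_MIME_TYPES or page.status == 206:
            page.isFile = True


# Example: a PDF response with no title is flagged, a titled HTML page is not.
pdf_page = PageStub(loadState=2, mime="application/pdf")
compute_page_type(pdf_page)
assert pdf_page.isFile

The migration below calls the real `Page.compute_page_type()` on each stored page and tallies the results per crawl.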
Migration 0028 (Python, 72 lines, 2.3 KiB):
"""
|
|
Migration 0028 - Page files and errors
|
|
"""
|
|
|
|
from btrixcloud.migrations import BaseMigration
|
|
from btrixcloud.models import Page, Crawl
|
|
|
|
|
|
MIGRATION_VERSION = "0028"
|
|
|
|
|
|
class Migration(BaseMigration):
|
|
"""Migration class."""
|
|
|
|
# pylint: disable=unused-argument
|
|
def __init__(self, mdb, **kwargs):
|
|
super().__init__(mdb, migration_version=MIGRATION_VERSION)
|
|
|
|
async def migrate_up(self):
|
|
"""Perform migration up.
|
|
|
|
Update older crawls and their pages:
|
|
- Add crawl.filePageCount and crawl.errorPageCount
|
|
- Set Page.isFile and Page.isError
|
|
"""
|
|
pages_db = self.mdb["pages"]
|
|
crawls_db = self.mdb["crawls"]
|
|
|
|
cursor = crawls_db.find({"type": "crawl", "filePageCount": None})
|
|
async for crawl_dict in cursor:
|
|
try:
|
|
crawl = Crawl.from_dict(crawl_dict)
|
|
crawl.filePageCount = 0
|
|
crawl.errorPageCount = 0
|
|
|
|
cursor = pages_db.find({"crawl_id": crawl.id})
|
|
async for page_dict in cursor:
|
|
page = Page.from_dict(page_dict)
|
|
|
|
page.compute_page_type()
|
|
if page.isFile:
|
|
crawl.filePageCount += 1
|
|
|
|
if page.isError:
|
|
crawl.errorPageCount += 1
|
|
|
|
if page.isFile or page.isError:
|
|
await pages_db.find_one_and_update(
|
|
{"_id": page.id},
|
|
{
|
|
"$set": page.dict(
|
|
include={"isFile": True, "isError": True}
|
|
)
|
|
},
|
|
)
|
|
|
|
await crawls_db.find_one_and_update(
|
|
{"_id": crawl.id, "type": "crawl"},
|
|
{
|
|
"$set": crawl.dict(
|
|
include={"filePageCount": True, "errorPageCount": True}
|
|
)
|
|
},
|
|
)
|
|
# pylint: disable=broad-exception-caught
|
|
except Exception as err:
|
|
crawl_id = crawl_dict.get("_id")
|
|
print(
|
|
f"Error updating page counts and pages for crawl {crawl_id}: {err}",
|
|
flush=True,
|
|
)
|
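For a quick post-migration spot check, the stored totals can be compared against a direct recount of the flagged pages. This is a minimal sketch, assuming a motor client, a local MongoDB URL, and a placeholder database name of `btrix`; none of these details come from the PR itself.

from motor.motor_asyncio import AsyncIOMotorClient


async def check_crawl_counts(crawl_id: str) -> None:
    """Compare a crawl's stored filePageCount / errorPageCount with a recount."""
    mdb = AsyncIOMotorClient("mongodb://localhost:27017")["btrix"]

    crawl = await mdb["crawls"].find_one({"_id": crawl_id, "type": "crawl"})

    # Recount the pages flagged by the migration
    file_pages = await mdb["pages"].count_documents(
        {"crawl_id": crawl_id, "isFile": True}
    )
    error_pages = await mdb["pages"].count_documents(
        {"crawl_id": crawl_id, "isError": True}
    )

    print("filePageCount ok:", crawl["filePageCount"] == file_pages)
    print("errorPageCount ok:", crawl["errorPageCount"] == error_pages)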