Backend: Move page file and error counts to crawl replay.json endpoint (#1868)
Backend work for #1859:
- Remove file count from QA stats endpoint
- Compute isFile or isError per page when the page is added
- Increment filePageCount and errorPageCount per crawl to count the number of isFile or isError pages
- Add file and error counts to the crawl replay.json endpoint (filePageCount and errorPageCount)
- Add migration 0028 to set isFile / isError for each page and aggregate filePageCount / errorPageCount per crawl
- Determine whether a page is a file based on loadState == 2, its mime type or status code, and the lack of a title
parent 16a720c685
commit 879e509b39
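
Aside (not part of the commit): a minimal client-side sketch of reading the new counts from the crawl replay.json endpoint, mirroring the test added below. The base URL, ids, and auth header values are placeholders, not values from this commit:

    import requests

    # Placeholder values: substitute your deployment's API prefix, org id, crawl id, and auth token.
    API_PREFIX = "https://example.com/api"
    org_id = "<org-uuid>"
    crawl_id = "<crawl-id>"
    auth_headers = {"Authorization": "Bearer <token>"}

    # Fetch crawl details from the replay.json endpoint and read the new per-crawl counters.
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
        headers=auth_headers,
    )
    data = r.json()
    print("file pages:", data["filePageCount"], "error pages:", data["errorPageCount"])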
@@ -17,7 +17,7 @@ from pymongo.errors import InvalidName
 from .migrations import BaseMigration
 
 
-CURR_DB_VERSION = "0027"
+CURR_DB_VERSION = "0028"
 
 
 # ============================================================================
@@ -0,0 +1,71 @@
+"""
+Migration 0028 - Page files and errors
+"""
+
+from btrixcloud.migrations import BaseMigration
+from btrixcloud.models import Page, Crawl
+
+
+MIGRATION_VERSION = "0028"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Update older crawls and their pages:
+        - Add crawl.filePageCount and crawl.errorPageCount
+        - Set Page.isFile and Page.isError
+        """
+        pages_db = self.mdb["pages"]
+        crawls_db = self.mdb["crawls"]
+
+        cursor = crawls_db.find({"type": "crawl", "filePageCount": None})
+        async for crawl_dict in cursor:
+            try:
+                crawl = Crawl.from_dict(crawl_dict)
+                crawl.filePageCount = 0
+                crawl.errorPageCount = 0
+
+                cursor = pages_db.find({"crawl_id": crawl.id})
+                async for page_dict in cursor:
+                    page = Page.from_dict(page_dict)
+
+                    page.compute_page_type()
+                    if page.isFile:
+                        crawl.filePageCount += 1
+
+                    if page.isError:
+                        crawl.errorPageCount += 1
+
+                    if page.isFile or page.isError:
+                        await pages_db.find_one_and_update(
+                            {"_id": page.id},
+                            {
+                                "$set": page.dict(
+                                    include={"isFile": True, "isError": True}
+                                )
+                            },
+                        )
+
+                await crawls_db.find_one_and_update(
+                    {"_id": crawl.id, "type": "crawl"},
+                    {
+                        "$set": crawl.dict(
+                            include={"filePageCount": True, "errorPageCount": True}
+                        )
+                    },
+                )
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                crawl_id = crawl_dict.get("_id")
+                print(
+                    f"Error updating page counts and pages for crawl {crawl_id}: {err}",
+                    flush=True,
+                )
@@ -666,6 +666,9 @@ class CrawlOut(BaseMongoModel):
     lastQAState: Optional[str]
     lastQAStarted: Optional[datetime]
 
+    filePageCount: Optional[int] = 0
+    errorPageCount: Optional[int] = 0
+
 
 # ============================================================================
 class CrawlOutWithResources(CrawlOut):
@@ -780,6 +783,9 @@ class Crawl(BaseCrawl, CrawlConfigCore):
     qa: Optional[QARun] = None
     qaFinished: Optional[Dict[str, QARun]] = {}
 
+    filePageCount: Optional[int] = 0
+    errorPageCount: Optional[int] = 0
+
 
 # ============================================================================
 class CrawlCompleteIn(BaseModel):
@@ -1567,6 +1573,23 @@ class Page(BaseMongoModel):
     approved: Optional[bool] = None
     notes: List[PageNote] = []
 
+    isFile: Optional[bool] = False
+    isError: Optional[bool] = False
+
+    def compute_page_type(self):
+        """sets self.isFile or self.isError flags"""
+        self.isFile = False
+        self.isError = False
+        if self.loadState == 2:
+            # pylint: disable=unsupported-membership-test
+            if self.mime and "html" not in self.mime:
+                self.isFile = True
+            elif self.title is None and self.status == 200:
+                self.isFile = True
+
+        elif self.loadState == 0:
+            self.isError = True
+
 
 # ============================================================================
 class PageWithAllQA(Page):
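
Aside (not part of the diff): a standalone sketch of the classification rules added above. classify_page is a hypothetical helper that mirrors Page.compute_page_type(); it is not part of the btrixcloud code:

    def classify_page(load_state, mime, status, title):
        """Mirror the compute_page_type() rules; returns (is_file, is_error)."""
        is_file = is_error = False
        if load_state == 2:
            # Per the rule above: non-HTML mime, or a 200 response with no title, is flagged as a file
            if mime and "html" not in mime:
                is_file = True
            elif title is None and status == 200:
                is_file = True
        elif load_state == 0:
            # Per the rule above: loadState 0 is flagged as an error page
            is_error = True
        return is_file, is_error

    assert classify_page(2, "application/pdf", 200, None) == (True, False)
    assert classify_page(0, None, 0, None) == (False, True)
    assert classify_page(2, "text/html", 200, "Home") == (False, False)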
@@ -36,7 +36,7 @@ else:
 
 
 # ============================================================================
-# pylint: disable=too-many-instance-attributes, too-many-arguments
+# pylint: disable=too-many-instance-attributes, too-many-arguments,too-many-public-methods
 class PageOps:
     """crawl pages"""
 
@@ -68,7 +68,7 @@ class PageOps:
                     continue
 
                 if len(pages_buffer) > batch_size:
-                    await self._add_pages_to_db(pages_buffer)
+                    await self._add_pages_to_db(crawl_id, pages_buffer)
 
                 pages_buffer.append(
                     self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
@@ -76,7 +76,7 @@ class PageOps:
 
             # Add any remaining pages in buffer to db
             if pages_buffer:
-                await self._add_pages_to_db(pages_buffer)
+                await self._add_pages_to_db(crawl_id, pages_buffer)
 
             print(f"Added pages for crawl {crawl_id} to db", flush=True)
         # pylint: disable=broad-exception-caught, raise-missing-from
@@ -84,7 +84,9 @@ class PageOps:
             traceback.print_exc()
             print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)
 
-    def _get_page_from_dict(self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID):
+    def _get_page_from_dict(
+        self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID
+    ) -> Page:
         """Return Page object from dict"""
         page_id = page_dict.get("id")
         if not page_id:
@@ -94,7 +96,7 @@ class PageOps:
         if not status and page_dict.get("loadState"):
             status = 200
 
-        return Page(
+        p = Page(
             id=page_id,
             oid=oid,
             crawl_id=crawl_id,
@@ -109,8 +111,10 @@ class PageOps:
                 else datetime.now()
             ),
         )
+        p.compute_page_type()
+        return p
 
-    async def _add_pages_to_db(self, pages: List[Page]):
+    async def _add_pages_to_db(self, crawl_id: str, pages: List[Page]):
         """Add batch of pages to db in one insert"""
         result = await self.pages.insert_many(
             [
|
|||||||
# pylint: disable=broad-exception-raised
|
# pylint: disable=broad-exception-raised
|
||||||
raise Exception("No pages inserted")
|
raise Exception("No pages inserted")
|
||||||
|
|
||||||
|
await self.update_crawl_file_and_error_counts(crawl_id, pages)
|
||||||
|
|
||||||
async def add_page_to_db(
|
async def add_page_to_db(
|
||||||
self,
|
self,
|
||||||
page_dict: Dict[str, Any],
|
page_dict: Dict[str, Any],
|
||||||
@ -133,12 +139,9 @@ class PageOps:
|
|||||||
):
|
):
|
||||||
"""Add page to database"""
|
"""Add page to database"""
|
||||||
page = self._get_page_from_dict(page_dict, crawl_id, oid)
|
page = self._get_page_from_dict(page_dict, crawl_id, oid)
|
||||||
print(f"PAGE: {page}", flush=True)
|
|
||||||
|
|
||||||
page_to_insert = page.to_dict(
|
page_to_insert = page.to_dict(
|
||||||
exclude_unset=True, exclude_none=True, exclude_defaults=True
|
exclude_unset=True, exclude_none=True, exclude_defaults=True
|
||||||
)
|
)
|
||||||
print(f"PAGE TO INSERT: {page_to_insert}")
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await self.pages.insert_one(page_to_insert)
|
await self.pages.insert_one(page_to_insert)
|
||||||
@ -153,6 +156,9 @@ class PageOps:
|
|||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
if not qa_run_id and page:
|
||||||
|
await self.update_crawl_file_and_error_counts(crawl_id, [page])
|
||||||
|
|
||||||
# qa data
|
# qa data
|
||||||
if qa_run_id and page:
|
if qa_run_id and page:
|
||||||
compare_dict = page_dict.get("comparison")
|
compare_dict = page_dict.get("comparison")
|
||||||
@@ -165,6 +171,39 @@ class PageOps:
 
             await self.add_qa_run_for_page(page.id, oid, qa_run_id, compare)
 
+    async def update_crawl_file_and_error_counts(
+        self, crawl_id: str, pages: List[Page]
+    ):
+        """Update crawl filePageCount and errorPageCount for pages."""
+        file_count = 0
+        error_count = 0
+
+        for page in pages:
+            if page.isFile:
+                file_count += 1
+
+            if page.isError:
+                error_count += 1
+
+        if file_count == 0 and error_count == 0:
+            return
+
+        inc_query = {}
+
+        if file_count > 0:
+            inc_query["filePageCount"] = file_count
+
+        if error_count > 0:
+            inc_query["errorPageCount"] = error_count
+
+        await self.crawls.find_one_and_update(
+            {
+                "_id": crawl_id,
+                "type": "crawl",
+            },
+            {"$inc": inc_query},
+        )
+
     async def delete_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None):
         """Delete crawl pages from db"""
         query: Dict[str, Union[str, UUID]] = {"crawl_id": crawl_id}
@@ -501,34 +540,6 @@ class PageOps:
 
         return [PageOut.from_dict(data) for data in items], total
 
-    async def get_crawl_file_count(self, crawl_id: str):
-        """Get count of pages in crawl that are files and don't need to be QAed"""
-        aggregate = [
-            {
-                "$match": {
-                    "crawl_id": crawl_id,
-                    "loadState": 2,
-                    "mime": {"$not": {"$regex": "^.*html", "$options": "i"}},
-                }
-            },
-            {"$count": "count"},
-        ]
-
-        cursor = self.pages.aggregate(aggregate)
-        results = await cursor.to_list(length=1)
-
-        if not results:
-            return 0
-
-        result = results[0]
-
-        try:
-            total = int(result["count"])
-        except (IndexError, ValueError):
-            total = 0
-
-        return total
-
     async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
         """Delete existing pages for crawl and re-add from WACZs."""
         await self.delete_crawl_pages(crawl_id, oid)
@@ -196,6 +196,18 @@ def test_crawls_exclude_full_seeds(admin_auth_headers, default_org_id, admin_cra
     assert config is None or config.get("seeds") is None
 
 
+def test_crawls_include_file_error_page_counts(
+    admin_auth_headers, default_org_id, admin_crawl_id
+):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    assert data["filePageCount"] >= 0
+    assert data["errorPageCount"] >= 0
+
+
 def test_download_wacz():
     r = requests.get(HOST_PREFIX + wacz_path)
     assert r.status_code == 200
@@ -474,6 +486,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page["loadState"]
         assert page["status"]
         assert page["mime"]
+        assert page["isError"] in (True, False)
+        assert page["isFile"] in (True, False)
 
     # Test GET page endpoint
    global page_id
@@ -493,6 +507,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page.get("title") or page.get("title") is None
     assert page["loadState"]
     assert page["mime"]
+    assert page["isError"] in (True, False)
+    assert page["isFile"] in (True, False)
 
     assert page["notes"] == []
     assert page.get("userid") is None
@@ -591,6 +607,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page.get("title") or page.get("title") is None
     assert page["loadState"]
     assert page["mime"]
+    assert page["isError"] in (True, False)
+    assert page["isFile"] in (True, False)
 
     assert page["notes"] == []
     assert page["userid"]
@@ -668,6 +686,8 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
         assert page["loadState"]
         assert page["status"]
         assert page["mime"]
+        assert page["isError"] in (True, False)
+        assert page["isFile"] in (True, False)
 
     # Ensure only superuser can re-add pages for all crawls in an org
     r = requests.post(