Backend: Move page file and error counts to crawl replay.json endpoint (#1868)

Backend work for #1859

- Remove the file count from the QA stats endpoint
- Compute isFile and isError for each page when the page is added
- Increment filePageCount and errorPageCount on the crawl to count the number of isFile and isError pages
- Add the file and error counts (filePageCount and errorPageCount) to the crawl replay.json endpoint
- Add migration 0028 to set isFile / isError on each existing page and aggregate filePageCount / errorPageCount per crawl
- Determine that a page is a file when loadState == 2 and it either has a non-HTML MIME type or returned a 200 status with no title; mark a page as an error when loadState == 0 (see the standalone sketch after the commit metadata)
Authored by Tessa Walsh on 2024-06-11 12:09:58 -04:00; committed by Ilya Kreymer
parent 16a720c685
commit 879e509b39
5 changed files with 163 additions and 38 deletions
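
For reference, the page classification rule this commit introduces in Page.compute_page_type() (see the models hunk below) boils down to the following standalone sketch; the helper name, signature, and sample values are illustrative only and are not part of the commit:

from typing import Optional, Tuple

def classify_page(
    load_state: Optional[int],
    mime: Optional[str],
    status: Optional[int],
    title: Optional[str],
) -> Tuple[bool, bool]:
    """Return (is_file, is_error) using the same rules as compute_page_type()."""
    is_file = False
    is_error = False
    if load_state == 2:
        # Fully loaded page: treat it as a file if it is not HTML, or if it
        # returned a 200 but produced no title.
        if mime and "html" not in mime:
            is_file = True
        elif title is None and status == 200:
            is_file = True
    elif load_state == 0:
        # Page never loaded at all: count it as an error page.
        is_error = True
    return is_file, is_error

# A successfully fetched PDF counts as a file rather than a QA-able page:
assert classify_page(2, "application/pdf", 200, None) == (True, False)
# A page that never loaded counts as an error:
assert classify_page(0, None, None, None) == (False, True)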


@@ -17,7 +17,7 @@ from pymongo.errors import InvalidName
from .migrations import BaseMigration
CURR_DB_VERSION = "0027"
CURR_DB_VERSION = "0028"
# ============================================================================


@@ -0,0 +1,71 @@
"""
Migration 0028 - Page files and errors
"""
from btrixcloud.migrations import BaseMigration
from btrixcloud.models import Page, Crawl
MIGRATION_VERSION = "0028"
class Migration(BaseMigration):
"""Migration class."""
# pylint: disable=unused-argument
def __init__(self, mdb, **kwargs):
super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
Update older crawls and their pages:
- Add crawl.filePageCount and crawl.errorPageCount
- Set Page.isFile and Page.isError
"""
pages_db = self.mdb["pages"]
crawls_db = self.mdb["crawls"]
cursor = crawls_db.find({"type": "crawl", "filePageCount": None})
async for crawl_dict in cursor:
try:
crawl = Crawl.from_dict(crawl_dict)
crawl.filePageCount = 0
crawl.errorPageCount = 0
cursor = pages_db.find({"crawl_id": crawl.id})
async for page_dict in cursor:
page = Page.from_dict(page_dict)
page.compute_page_type()
if page.isFile:
crawl.filePageCount += 1
if page.isError:
crawl.errorPageCount += 1
if page.isFile or page.isError:
await pages_db.find_one_and_update(
{"_id": page.id},
{
"$set": page.dict(
include={"isFile": True, "isError": True}
)
},
)
await crawls_db.find_one_and_update(
{"_id": crawl.id, "type": "crawl"},
{
"$set": crawl.dict(
include={"filePageCount": True, "errorPageCount": True}
)
},
)
# pylint: disable=broad-exception-caught
except Exception as err:
crawl_id = crawl_dict.get("_id")
print(
f"Error updating page counts and pages for crawl {crawl_id}: {err}",
flush=True,
)
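
One detail worth noting about the crawl filter above: matching on "filePageCount": None relies on MongoDB's null-query semantics, so it selects crawls where the field is either missing entirely or stored as null, i.e. every crawl created before this field existed. Written out explicitly, an equivalent filter would look roughly like this (a sketch, not code from the commit):

explicit_filter = {
    "type": "crawl",
    "$or": [
        {"filePageCount": {"$exists": False}},  # field missing (crawls from before this commit)
        {"filePageCount": {"$type": "null"}},   # field present but explicitly null
    ],
}
cursor = crawls_db.find(explicit_filter)  # crawls_db as defined in the migration above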


@@ -666,6 +666,9 @@ class CrawlOut(BaseMongoModel):
lastQAState: Optional[str]
lastQAStarted: Optional[datetime]
filePageCount: Optional[int] = 0
errorPageCount: Optional[int] = 0
# ============================================================================
class CrawlOutWithResources(CrawlOut):
@@ -780,6 +783,9 @@ class Crawl(BaseCrawl, CrawlConfigCore):
qa: Optional[QARun] = None
qaFinished: Optional[Dict[str, QARun]] = {}
filePageCount: Optional[int] = 0
errorPageCount: Optional[int] = 0
# ============================================================================
class CrawlCompleteIn(BaseModel):
@@ -1567,6 +1573,23 @@ class Page(BaseMongoModel):
approved: Optional[bool] = None
notes: List[PageNote] = []
isFile: Optional[bool] = False
isError: Optional[bool] = False
def compute_page_type(self):
"""sets self.isFile or self.isError flags"""
self.isFile = False
self.isError = False
if self.loadState == 2:
# pylint: disable=unsupported-membership-test
if self.mime and "html" not in self.mime:
self.isFile = True
elif self.title is None and self.status == 200:
self.isFile = True
elif self.loadState == 0:
self.isError = True
# ============================================================================
class PageWithAllQA(Page):


@@ -36,7 +36,7 @@ else:
# ============================================================================
# pylint: disable=too-many-instance-attributes, too-many-arguments
# pylint: disable=too-many-instance-attributes, too-many-arguments,too-many-public-methods
class PageOps:
"""crawl pages"""
@@ -68,7 +68,7 @@ class PageOps:
continue
if len(pages_buffer) > batch_size:
await self._add_pages_to_db(pages_buffer)
await self._add_pages_to_db(crawl_id, pages_buffer)
pages_buffer.append(
self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
@@ -76,7 +76,7 @@
# Add any remaining pages in buffer to db
if pages_buffer:
await self._add_pages_to_db(pages_buffer)
await self._add_pages_to_db(crawl_id, pages_buffer)
print(f"Added pages for crawl {crawl_id} to db", flush=True)
# pylint: disable=broad-exception-caught, raise-missing-from
@@ -84,7 +84,9 @@
traceback.print_exc()
print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)
def _get_page_from_dict(self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID):
def _get_page_from_dict(
self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID
) -> Page:
"""Return Page object from dict"""
page_id = page_dict.get("id")
if not page_id:
@@ -94,7 +96,7 @@
if not status and page_dict.get("loadState"):
status = 200
return Page(
p = Page(
id=page_id,
oid=oid,
crawl_id=crawl_id,
@@ -109,8 +111,10 @@
else datetime.now()
),
)
p.compute_page_type()
return p
async def _add_pages_to_db(self, pages: List[Page]):
async def _add_pages_to_db(self, crawl_id: str, pages: List[Page]):
"""Add batch of pages to db in one insert"""
result = await self.pages.insert_many(
[
@@ -124,6 +128,8 @@
# pylint: disable=broad-exception-raised
raise Exception("No pages inserted")
await self.update_crawl_file_and_error_counts(crawl_id, pages)
async def add_page_to_db(
self,
page_dict: Dict[str, Any],
@@ -133,12 +139,9 @@
):
"""Add page to database"""
page = self._get_page_from_dict(page_dict, crawl_id, oid)
print(f"PAGE: {page}", flush=True)
page_to_insert = page.to_dict(
exclude_unset=True, exclude_none=True, exclude_defaults=True
)
print(f"PAGE TO INSERT: {page_to_insert}")
try:
await self.pages.insert_one(page_to_insert)
@@ -153,6 +156,9 @@
)
return
if not qa_run_id and page:
await self.update_crawl_file_and_error_counts(crawl_id, [page])
# qa data
if qa_run_id and page:
compare_dict = page_dict.get("comparison")
@@ -165,6 +171,39 @@
await self.add_qa_run_for_page(page.id, oid, qa_run_id, compare)
async def update_crawl_file_and_error_counts(
self, crawl_id: str, pages: List[Page]
):
"""Update crawl filePageCount and errorPageCount for pages."""
file_count = 0
error_count = 0
for page in pages:
if page.isFile:
file_count += 1
if page.isError:
error_count += 1
if file_count == 0 and error_count == 0:
return
inc_query = {}
if file_count > 0:
inc_query["filePageCount"] = file_count
if error_count > 0:
inc_query["errorPageCount"] = error_count
await self.crawls.find_one_and_update(
{
"_id": crawl_id,
"type": "crawl",
},
{"$inc": inc_query},
)
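# To make the bookkeeping concrete: for a hypothetical batch with two file
# pages and one error page, the counting above yields the following $inc
# document (standalone sketch, not code from the commit):
sample_pages = [
    {"isFile": True, "isError": False},   # e.g. a PDF
    {"isFile": True, "isError": False},   # e.g. an image served directly
    {"isFile": False, "isError": True},   # e.g. a page that never loaded
]
sample_inc = {}
sample_file_count = sum(1 for p in sample_pages if p["isFile"])
sample_error_count = sum(1 for p in sample_pages if p["isError"])
if sample_file_count:
    sample_inc["filePageCount"] = sample_file_count
if sample_error_count:
    sample_inc["errorPageCount"] = sample_error_count
assert sample_inc == {"filePageCount": 2, "errorPageCount": 1}
# The crawl document is then updated with {"$inc": sample_inc}.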
async def delete_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None):
"""Delete crawl pages from db"""
query: Dict[str, Union[str, UUID]] = {"crawl_id": crawl_id}
@@ -501,34 +540,6 @@ class PageOps:
return [PageOut.from_dict(data) for data in items], total
async def get_crawl_file_count(self, crawl_id: str):
"""Get count of pages in crawl that are files and don't need to be QAed"""
aggregate = [
{
"$match": {
"crawl_id": crawl_id,
"loadState": 2,
"mime": {"$not": {"$regex": "^.*html", "$options": "i"}},
}
},
{"$count": "count"},
]
cursor = self.pages.aggregate(aggregate)
results = await cursor.to_list(length=1)
if not results:
return 0
result = results[0]
try:
total = int(result["count"])
except (IndexError, ValueError):
total = 0
return total
async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
"""Delete existing pages for crawl and re-add from WACZs."""
await self.delete_crawl_pages(crawl_id, oid)


@@ -196,6 +196,18 @@ def test_crawls_exclude_full_seeds(admin_auth_headers, default_org_id, admin_cra
assert config is None or config.get("seeds") is None
def test_crawls_include_file_error_page_counts(
admin_auth_headers, default_org_id, admin_crawl_id
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
assert data["filePageCount"] >= 0
assert data["errorPageCount"] >= 0
def test_download_wacz():
r = requests.get(HOST_PREFIX + wacz_path)
assert r.status_code == 200
@@ -474,6 +486,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
assert page["loadState"]
assert page["status"]
assert page["mime"]
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)
# Test GET page endpoint
global page_id
@@ -493,6 +507,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
assert page.get("title") or page.get("title") is None
assert page["loadState"]
assert page["mime"]
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)
assert page["notes"] == []
assert page.get("userid") is None
@@ -591,6 +607,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
assert page.get("title") or page.get("title") is None
assert page["loadState"]
assert page["mime"]
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)
assert page["notes"] == []
assert page["userid"]
@@ -668,6 +686,8 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
assert page["loadState"]
assert page["status"]
assert page["mime"]
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)
# Ensure only superuser can re-add pages for all crawls in an org
r = requests.post(