diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py
index c874bebf..fbbbea40 100644
--- a/backend/btrixcloud/db.py
+++ b/backend/btrixcloud/db.py
@@ -17,7 +17,7 @@ from pymongo.errors import InvalidName
 
 from .migrations import BaseMigration
 
-CURR_DB_VERSION = "0027"
+CURR_DB_VERSION = "0028"
 
 
 # ============================================================================
diff --git a/backend/btrixcloud/migrations/migration_0028_page_files_errors.py b/backend/btrixcloud/migrations/migration_0028_page_files_errors.py
new file mode 100644
index 00000000..05bc6ee1
--- /dev/null
+++ b/backend/btrixcloud/migrations/migration_0028_page_files_errors.py
@@ -0,0 +1,71 @@
+"""
+Migration 0028 - Page files and errors
+"""
+
+from btrixcloud.migrations import BaseMigration
+from btrixcloud.models import Page, Crawl
+
+
+MIGRATION_VERSION = "0028"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Update older crawls and their pages:
+        - Add crawl.filePageCount and crawl.errorPageCount
+        - Set Page.isFile and Page.isError
+        """
+        pages_db = self.mdb["pages"]
+        crawls_db = self.mdb["crawls"]
+
+        cursor = crawls_db.find({"type": "crawl", "filePageCount": None})
+        async for crawl_dict in cursor:
+            try:
+                crawl = Crawl.from_dict(crawl_dict)
+                crawl.filePageCount = 0
+                crawl.errorPageCount = 0
+
+                cursor = pages_db.find({"crawl_id": crawl.id})
+                async for page_dict in cursor:
+                    page = Page.from_dict(page_dict)
+
+                    page.compute_page_type()
+                    if page.isFile:
+                        crawl.filePageCount += 1
+
+                    if page.isError:
+                        crawl.errorPageCount += 1
+
+                    if page.isFile or page.isError:
+                        await pages_db.find_one_and_update(
+                            {"_id": page.id},
+                            {
+                                "$set": page.dict(
+                                    include={"isFile": True, "isError": True}
+                                )
+                            },
+                        )
+
+                await crawls_db.find_one_and_update(
+                    {"_id": crawl.id, "type": "crawl"},
+                    {
+                        "$set": crawl.dict(
+                            include={"filePageCount": True, "errorPageCount": True}
+                        )
+                    },
+                )
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                crawl_id = crawl_dict.get("_id")
+                print(
+                    f"Error updating page counts and pages for crawl {crawl_id}: {err}",
+                    flush=True,
+                )
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index 5553c47c..87e3ffa4 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -666,6 +666,9 @@ class CrawlOut(BaseMongoModel):
     lastQAState: Optional[str]
     lastQAStarted: Optional[datetime]
 
+    filePageCount: Optional[int] = 0
+    errorPageCount: Optional[int] = 0
+
 
 # ============================================================================
 class CrawlOutWithResources(CrawlOut):
@@ -780,6 +783,9 @@ class Crawl(BaseCrawl, CrawlConfigCore):
     qa: Optional[QARun] = None
     qaFinished: Optional[Dict[str, QARun]] = {}
 
+    filePageCount: Optional[int] = 0
+    errorPageCount: Optional[int] = 0
+
 
 # ============================================================================
 class CrawlCompleteIn(BaseModel):
@@ -1567,6 +1573,23 @@ class Page(BaseMongoModel):
     approved: Optional[bool] = None
     notes: List[PageNote] = []
 
+    isFile: Optional[bool] = False
+    isError: Optional[bool] = False
+
+    def compute_page_type(self):
+        """sets self.isFile or self.isError flags"""
+        self.isFile = False
+        self.isError = False
+        if self.loadState == 2:
+            # pylint: disable=unsupported-membership-test
+            if self.mime and "html" not in self.mime:
+                self.isFile = True
+            elif self.title is None and self.status == 200:
+                self.isFile = True
+
+        elif self.loadState == 0:
+            self.isError = True
+
 
 # ============================================================================
 class PageWithAllQA(Page):
diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index b549ee47..414a898a 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -36,7 +36,7 @@ else:
 
 
 # ============================================================================
-# pylint: disable=too-many-instance-attributes, too-many-arguments
+# pylint: disable=too-many-instance-attributes, too-many-arguments,too-many-public-methods
 class PageOps:
     """crawl pages"""
 
@@ -68,7 +68,7 @@
                     continue
 
                 if len(pages_buffer) > batch_size:
-                    await self._add_pages_to_db(pages_buffer)
+                    await self._add_pages_to_db(crawl_id, pages_buffer)
 
                 pages_buffer.append(
                     self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
@@ -76,7 +76,7 @@
 
             # Add any remaining pages in buffer to db
            if pages_buffer:
-                await self._add_pages_to_db(pages_buffer)
+                await self._add_pages_to_db(crawl_id, pages_buffer)
 
             print(f"Added pages for crawl {crawl_id} to db", flush=True)
         # pylint: disable=broad-exception-caught, raise-missing-from
@@ -84,7 +84,9 @@
             traceback.print_exc()
             print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)
 
-    def _get_page_from_dict(self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID):
+    def _get_page_from_dict(
+        self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID
+    ) -> Page:
         """Return Page object from dict"""
         page_id = page_dict.get("id")
         if not page_id:
@@ -94,7 +96,7 @@
         if not status and page_dict.get("loadState"):
             status = 200
 
-        return Page(
+        p = Page(
             id=page_id,
             oid=oid,
             crawl_id=crawl_id,
@@ -109,8 +111,10 @@
                 else datetime.now()
             ),
         )
+        p.compute_page_type()
+        return p
 
-    async def _add_pages_to_db(self, pages: List[Page]):
+    async def _add_pages_to_db(self, crawl_id: str, pages: List[Page]):
         """Add batch of pages to db in one insert"""
         result = await self.pages.insert_many(
             [
@@ -124,6 +128,8 @@
             # pylint: disable=broad-exception-raised
             raise Exception("No pages inserted")
 
+        await self.update_crawl_file_and_error_counts(crawl_id, pages)
+
     async def add_page_to_db(
         self,
         page_dict: Dict[str, Any],
@@ -133,12 +139,9 @@
     ):
         """Add page to database"""
         page = self._get_page_from_dict(page_dict, crawl_id, oid)
-        print(f"PAGE: {page}", flush=True)
-
         page_to_insert = page.to_dict(
             exclude_unset=True, exclude_none=True, exclude_defaults=True
         )
-        print(f"PAGE TO INSERT: {page_to_insert}")
 
         try:
             await self.pages.insert_one(page_to_insert)
@@ -153,6 +156,9 @@
             )
             return
 
+        if not qa_run_id and page:
+            await self.update_crawl_file_and_error_counts(crawl_id, [page])
+
         # qa data
         if qa_run_id and page:
             compare_dict = page_dict.get("comparison")
@@ -165,6 +171,39 @@
 
             await self.add_qa_run_for_page(page.id, oid, qa_run_id, compare)
 
+    async def update_crawl_file_and_error_counts(
+        self, crawl_id: str, pages: List[Page]
+    ):
+        """Update crawl filePageCount and errorPageCount for pages."""
+        file_count = 0
+        error_count = 0
+
+        for page in pages:
+            if page.isFile:
+                file_count += 1
+
+            if page.isError:
+                error_count += 1
+
+        if file_count == 0 and error_count == 0:
+            return
+
+        inc_query = {}
+
+        if file_count > 0:
+            inc_query["filePageCount"] = file_count
+
+        if error_count > 0:
+            inc_query["errorPageCount"] = error_count
+
+        await self.crawls.find_one_and_update(
+            {
+                "_id": crawl_id,
+                "type": "crawl",
+            },
+            {"$inc": inc_query},
+        )
+
     async def delete_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None):
         """Delete crawl pages from db"""
         query: Dict[str, Union[str, UUID]] = {"crawl_id": crawl_id}
@@ -501,34 +540,6 @@
 
         return [PageOut.from_dict(data) for data in items], total
 
-    async def get_crawl_file_count(self, crawl_id: str):
-        """Get count of pages in crawl that are files and don't need to be QAed"""
-        aggregate = [
-            {
-                "$match": {
-                    "crawl_id": crawl_id,
-                    "loadState": 2,
-                    "mime": {"$not": {"$regex": "^.*html", "$options": "i"}},
-                }
-            },
-            {"$count": "count"},
-        ]
-
-        cursor = self.pages.aggregate(aggregate)
-        results = await cursor.to_list(length=1)
-
-        if not results:
-            return 0
-
-        result = results[0]
-
-        try:
-            total = int(result["count"])
-        except (IndexError, ValueError):
-            total = 0
-
-        return total
-
     async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
         """Delete existing pages for crawl and re-add from WACZs."""
         await self.delete_crawl_pages(crawl_id, oid)
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index bbbe7fee..5aa40e54 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -196,6 +196,18 @@ def test_crawls_exclude_full_seeds(admin_auth_headers, default_org_id, admin_cra
     assert config is None or config.get("seeds") is None
 
 
+def test_crawls_include_file_error_page_counts(
+    admin_auth_headers, default_org_id, admin_crawl_id
+):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    assert data["filePageCount"] >= 0
+    assert data["errorPageCount"] >= 0
+
+
 def test_download_wacz():
     r = requests.get(HOST_PREFIX + wacz_path)
     assert r.status_code == 200
@@ -474,6 +486,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page["loadState"]
         assert page["status"]
         assert page["mime"]
+        assert page["isError"] in (True, False)
+        assert page["isFile"] in (True, False)
 
     # Test GET page endpoint
     global page_id
@@ -493,6 +507,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page.get("title") or page.get("title") is None
     assert page["loadState"]
     assert page["mime"]
+    assert page["isError"] in (True, False)
+    assert page["isFile"] in (True, False)
 
     assert page["notes"] == []
     assert page.get("userid") is None
@@ -591,6 +607,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page.get("title") or page.get("title") is None
     assert page["loadState"]
     assert page["mime"]
+    assert page["isError"] in (True, False)
+    assert page["isFile"] in (True, False)
 
     assert page["notes"] == []
     assert page["userid"]
@@ -668,6 +686,8 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
         assert page["loadState"]
         assert page["status"]
         assert page["mime"]
+        assert page["isError"] in (True, False)
+        assert page["isFile"] in (True, False)
 
     # Ensure only superuser can re-add pages for all crawls in an org
     r = requests.post(
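
Read in isolation, the rules added in Page.compute_page_type amount to a small decision table over loadState, mime, title, and status. The snippet below is a minimal standalone sketch of those rules for review; classify_page and its bare tuple return are illustrative names only and are not part of the codebase.

    from typing import Optional, Tuple


    def classify_page(
        load_state: Optional[int],
        mime: Optional[str],
        title: Optional[str],
        status: Optional[int],
    ) -> Tuple[bool, bool]:
        """Return (is_file, is_error) following the same rules as compute_page_type."""
        is_file = False
        is_error = False
        if load_state == 2:
            # Fully loaded but non-HTML content is treated as a file download
            if mime and "html" not in mime:
                is_file = True
            # A loaded page with no title and a 200 status also counts as a file
            elif title is None and status == 200:
                is_file = True
        elif load_state == 0:
            # loadState 0 means the page failed to load entirely
            is_error = True
        return is_file, is_error


    assert classify_page(2, "application/pdf", None, 200) == (True, False)
    assert classify_page(2, "text/html", "Home", 200) == (False, False)
    assert classify_page(0, None, None, None) == (False, True)

Storing the flags on each Page document, rather than recomputing them per request as the removed get_crawl_file_count aggregation did, is what allows the crawl-level counters to be kept up to date with a single increment per batch.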
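The counter update in update_crawl_file_and_error_counts then reduces to tallying those flags for a batch and issuing one $inc against the crawl document. The sketch below mirrors that logic without the database; FlaggedPage and build_count_inc are hypothetical stand-ins for Page and the new method, assuming only the filePageCount and errorPageCount fields introduced in this diff.

    from dataclasses import dataclass
    from typing import Dict, List


    @dataclass
    class FlaggedPage:
        # Stand-in for the two new Page flags; the real model has many more fields
        isFile: bool = False
        isError: bool = False


    def build_count_inc(pages: List[FlaggedPage]) -> Dict[str, Dict[str, int]]:
        """Build the MongoDB update that bumps the crawl's file/error page counters."""
        file_count = sum(1 for p in pages if p.isFile)
        error_count = sum(1 for p in pages if p.isError)
        inc_query: Dict[str, int] = {}
        if file_count:
            inc_query["filePageCount"] = file_count
        if error_count:
            inc_query["errorPageCount"] = error_count
        # The real method returns early instead of issuing an empty update
        return {"$inc": inc_query} if inc_query else {}


    assert build_count_inc(
        [FlaggedPage(isFile=True), FlaggedPage(), FlaggedPage(isError=True)]
    ) == {"$inc": {"filePageCount": 1, "errorPageCount": 1}}

Because $inc is atomic, batches from _add_pages_to_db and single pages from add_page_to_db can both feed the same counters without read-modify-write races.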