diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py
index 510cef49..471de598 100644
--- a/backend/btrixcloud/db.py
+++ b/backend/btrixcloud/db.py
@@ -32,7 +32,7 @@ else:
     ) = PageOps = BackgroundJobOps = object
 
 
-CURR_DB_VERSION = "0044"
+CURR_DB_VERSION = "0045"
 
 
 # ============================================================================
diff --git a/backend/btrixcloud/migrations/migration_0045_crawl_counts.py b/backend/btrixcloud/migrations/migration_0045_crawl_counts.py
new file mode 100644
index 00000000..a8bc70f9
--- /dev/null
+++ b/backend/btrixcloud/migrations/migration_0045_crawl_counts.py
@@ -0,0 +1,60 @@
+"""
+Migration 0045 - Recalculate crawl filePageCount and errorPageCount
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0045"
+
+
+# pylint: disable=duplicate-code
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+        self.page_ops = kwargs.get("page_ops")
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Recalculate crawl filePageCount and errorPageCount for all crawls
+        """
+        crawls_mdb = self.mdb["crawls"]
+
+        if self.page_ops is None:
+            print(
+                "Unable to reset crawl page counts, missing page_ops",
+                flush=True,
+            )
+            return
+
+        match_query = {
+            "$or": [{"errorPageCount": {"$gt": 0}}, {"filePageCount": {"$gt": 0}}]
+        }
+        async for crawl_raw in crawls_mdb.find(match_query, projection=["_id"]):
+            crawl_id = crawl_raw["_id"]
+
+            try:
+                # Reset filePageCount and errorPageCount to 0
+                await crawls_mdb.find_one_and_update(
+                    {"_id": crawl_id},
+                    {
+                        "$set": {
+                            "filePageCount": 0,
+                            "errorPageCount": 0,
+                        }
+                    },
+                )
+
+                # Re-increment filePageCount and errorPageCount
+                await self.page_ops.update_crawl_file_and_error_counts(crawl_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(
+                    f"Unable to update page counts for crawl {crawl_id}: {err}",
+                    flush=True,
+                )
diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index 07a8fda0..be0ba40a 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -246,18 +246,25 @@ class PageOps:
         await self.add_qa_run_for_page(page.id, oid, qa_run_id, compare)
 
     async def update_crawl_file_and_error_counts(
-        self, crawl_id: str, pages: List[Page]
+        self, crawl_id: str, pages: Optional[List[Page]] = None
     ):
         """Update crawl filePageCount and errorPageCount for pages."""
         file_count = 0
         error_count = 0
 
-        for page in pages:
-            if page.isFile:
-                file_count += 1
-
-            if page.isError:
-                error_count += 1
+        if pages is not None:
+            for page in pages:
+                if page.isFile:
+                    file_count += 1
+                if page.isError:
+                    error_count += 1
+        else:
+            # If page list not supplied, count all pages in crawl
+            async for page_raw in self.pages.find({"crawl_id": crawl_id}):
+                if page_raw.get("isFile"):
+                    file_count += 1
+                if page_raw.get("isError"):
+                    error_count += 1
 
         if file_count == 0 and error_count == 0:
             return
@@ -276,7 +283,7 @@
         )
 
     async def delete_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None):
-        """Delete crawl pages from db"""
+        """Delete crawl pages from db and clear crawl page counts"""
        query: Dict[str, Union[str, UUID]] = {"crawl_id": crawl_id}
         if oid:
             query["oid"] = oid
@@ -289,6 +296,25 @@
             flush=True,
         )
 
+        try:
+            await self.crawls.find_one_and_update(
+                {"_id": crawl_id},
+                {
+                    "$set": {
+                        "pageCount": 0,
+                        "uniquePageCount": 0,
+                        "filePageCount": 0,
+                        "errorPageCount": 0,
+                    }
+                },
+            )
+        # pylint: disable=broad-except
+        except Exception as err:
+            print(
+                f"Error resetting page counts for crawl {crawl_id}: {err}",
+                flush=True,
+            )
+
     async def get_page_raw(
         self,
         page_id: UUID,
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index bcb9b18e..33eb7851 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -956,6 +956,19 @@ def test_crawl_pages_qa_filters(crawler_auth_headers, default_org_id, crawler_cr
 
 
 def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
+    # Store page counts to compare against after re-adding
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    page_count_before = data["pageCount"]
+    page_count_before_unique = data["uniquePageCount"]
+    page_count_before_files = data["filePageCount"]
+    page_count_before_errors = data["errorPageCount"]
+
     # Re-add pages and verify they were correctly added
     r = requests.post(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/reAdd",
@@ -1001,15 +1014,20 @@
     )
     assert r.status_code == 403
 
-    # Check that pageCount and uniquePageCount were stored on crawl
+    # Check that crawl page counts were recalculated properly
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
     data = r.json()
-    assert data["pageCount"] > 0
-    assert data["uniquePageCount"] > 0
+    assert data["pageCount"] > 0 and data["pageCount"] == page_count_before
+    assert (
+        data["uniquePageCount"] > 0
+        and data["uniquePageCount"] == page_count_before_unique
+    )
+    assert data["filePageCount"] == page_count_before_files
+    assert data["errorPageCount"] == page_count_before_errors
 
 
 def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id):