Ensure crawl page counts are correct when re-adding pages (#2601)
Fixes #2600. This PR fixes the issue by ensuring that crawl page counts (total, unique, files, errors) are reset to 0 when crawl pages are deleted, such as right before pages are re-added. It also adds a migration that recalculates file and error page counts for each crawl without re-adding pages from the WACZ files.
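In outline, the fix has two parts: clear the stored counts whenever a crawl's pages are deleted, and have the migration recount file/error pages directly from the pages collection. A minimal sketch of the count-reset step, assuming Motor collection handles and the field names used in this diff (`reset_crawl_page_counts` is an illustrative helper, not the actual implementation, which lives in `PageOps.delete_crawl_pages` below):

```python
# Illustrative sketch only: zero out stored page counts so that re-added pages
# increment the counters from 0 instead of stacking on top of stale values.
async def reset_crawl_page_counts(crawls_coll, crawl_id: str) -> None:
    await crawls_coll.find_one_and_update(
        {"_id": crawl_id},
        {
            "$set": {
                "pageCount": 0,
                "uniquePageCount": 0,
                "filePageCount": 0,
                "errorPageCount": 0,
            }
        },
    )
```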
This commit is contained in:
parent
594f5bc171
commit
6f81d588a9
@@ -32,7 +32,7 @@ else:
     ) = PageOps = BackgroundJobOps = object
 
 
-CURR_DB_VERSION = "0044"
+CURR_DB_VERSION = "0045"
 
 
 # ============================================================================
backend/btrixcloud/migrations/migration_0045_crawl_counts.py — Normal file (60 lines added)
@@ -0,0 +1,60 @@
+"""
+Migration 0045 - Recalculate crawl filePageCount and errorPageCount
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0045"
+
+
+# pylint: disable=duplicate-code
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+        self.page_ops = kwargs.get("page_ops")
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Recalculate crawl filePageCount and errorPageCount for all crawls
+        """
+        crawls_mdb = self.mdb["crawls"]
+
+        if self.page_ops is None:
+            print(
+                "Unable to reset crawl page counts, missing page_ops",
+                flush=True,
+            )
+            return
+
+        match_query = {
+            "$or": [{"errorPageCount": {"$gt": 0}}, {"filePageCount": {"$gt": 0}}]
+        }
+        async for crawl_raw in crawls_mdb.find(match_query, projection=["_id"]):
+            crawl_id = crawl_raw["_id"]
+
+            try:
+                # Reset filePageCount and errorPageCount to 0
+                await crawls_mdb.find_one_and_update(
+                    {"_id": crawl_id},
+                    {
+                        "$set": {
+                            "filePageCount": 0,
+                            "errorPageCount": 0,
+                        }
+                    },
+                )
+
+                # Re-increment filePageCount and errorPageCount
+                await self.page_ops.update_crawl_file_and_error_counts(crawl_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(
+                    f"Unable to update page counts for crawl {crawl_id}: {err}",
+                    flush=True,
+                )
@@ -246,18 +246,25 @@ class PageOps:
         await self.add_qa_run_for_page(page.id, oid, qa_run_id, compare)
 
     async def update_crawl_file_and_error_counts(
-        self, crawl_id: str, pages: List[Page]
+        self, crawl_id: str, pages: Optional[List[Page]] = None
     ):
         """Update crawl filePageCount and errorPageCount for pages."""
         file_count = 0
         error_count = 0
 
-        for page in pages:
-            if page.isFile:
-                file_count += 1
-
-            if page.isError:
-                error_count += 1
+        if pages is not None:
+            for page in pages:
+                if page.isFile:
+                    file_count += 1
+                if page.isError:
+                    error_count += 1
+        else:
+            # If page list not supplied, count all pages in crawl
+            async for page_raw in self.pages.find({"crawl_id": crawl_id}):
+                if page_raw.get("isFile"):
+                    file_count += 1
+                if page_raw.get("isError"):
+                    error_count += 1
 
         if file_count == 0 and error_count == 0:
             return
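For context, the updated method now supports two calling modes; a hedged usage sketch, assuming a `PageOps` instance named `page_ops`, a crawl id string, and a list of `Page` models:

```python
# When pages are (re-)added from WACZ files, the parsed Page objects can be
# passed in directly:
await page_ops.update_crawl_file_and_error_counts(crawl_id, pages)

# Migration 0045 omits the page list, so the method recounts isFile/isError
# pages from the pages collection instead:
await page_ops.update_crawl_file_and_error_counts(crawl_id)
```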
@@ -276,7 +283,7 @@ class PageOps:
         )
 
     async def delete_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None):
-        """Delete crawl pages from db"""
+        """Delete crawl pages from db and clear crawl page counts"""
         query: Dict[str, Union[str, UUID]] = {"crawl_id": crawl_id}
         if oid:
             query["oid"] = oid
@@ -289,6 +296,25 @@ class PageOps:
                 flush=True,
             )
 
+        try:
+            await self.crawls.find_one_and_update(
+                {"_id": crawl_id},
+                {
+                    "$set": {
+                        "pageCount": 0,
+                        "uniquePageCount": 0,
+                        "filePageCount": 0,
+                        "errorPageCount": 0,
+                    }
+                },
+            )
+        # pylint: disable=broad-except
+        except Exception as err:
+            print(
+                f"Error resetting page counts for crawl {crawl_id}: {err}",
+                flush=True,
+            )
+
     async def get_page_raw(
         self,
         page_id: UUID,
@@ -956,6 +956,19 @@ def test_crawl_pages_qa_filters(crawler_auth_headers, default_org_id, crawler_cr
 
 
 def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
+    # Store page counts to compare against after re-adding
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    page_count_before = data["pageCount"]
+    page_count_before_unique = data["uniquePageCount"]
+    page_count_before_files = data["filePageCount"]
+    page_count_before_errors = data["errorPageCount"]
+
     # Re-add pages and verify they were correctly added
     r = requests.post(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/reAdd",
@@ -1001,15 +1014,20 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
     )
     assert r.status_code == 403
 
-    # Check that pageCount and uniquePageCount were stored on crawl
+    # Check that crawl page counts were recalculated properly
    r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
     data = r.json()
-    assert data["pageCount"] > 0
-    assert data["uniquePageCount"] > 0
+    assert data["pageCount"] > 0 and data["pageCount"] == page_count_before
+    assert (
+        data["uniquePageCount"] > 0
+        and data["uniquePageCount"] == page_count_before_unique
+    )
+    assert data["filePageCount"] == page_count_before_files
+    assert data["errorPageCount"] == page_count_before_errors
 
 
 def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id):