Ensure crawl page counts are correct when re-adding pages (#2601)
Fixes #2600. This PR fixes the issue by ensuring that crawl page counts (total, unique, files, errors) are reset to 0 when crawl pages are deleted, such as right before they are re-added. It also adds a migration that recalculates the file and error page counts for each crawl without re-adding pages from the WACZ files.
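For illustration, the sketch below mirrors the flow exercised by the updated test further down: fetch the stored counts, trigger a page re-add, and confirm the counts come back to their previous values. The URL, ids, and auth headers are placeholders, and the re-add runs as a background job, so a real client would poll before comparing.

import requests

# Placeholder values; a running Browsertrix API, an org id, a finished crawl id,
# and valid auth headers are assumed.
API_PREFIX = "https://app.example.com/api"
org_id = "<org-id>"
crawl_id = "<crawl-id>"
auth_headers = {"Authorization": "Bearer <token>"}

crawl_url = f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}"

# Record the stored page counts before re-adding pages
before = requests.get(crawl_url, headers=auth_headers).json()

# Re-add pages: existing pages are deleted (counts reset to 0), then pages are
# re-imported from the crawl's WACZ files and the counts are re-incremented
requests.post(f"{crawl_url}/pages/reAdd", headers=auth_headers)

# Once the re-add has finished (poll in practice), the counts should match
after = requests.get(crawl_url, headers=auth_headers).json()
for field in ("pageCount", "uniquePageCount", "filePageCount", "errorPageCount"):
    assert after[field] == before[field]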
This commit is contained in:
parent 594f5bc171
commit 6f81d588a9
@@ -32,7 +32,7 @@ else:
     ) = PageOps = BackgroundJobOps = object


-CURR_DB_VERSION = "0044"
+CURR_DB_VERSION = "0045"


 # ============================================================================
backend/btrixcloud/migrations/migration_0045_crawl_counts.py (new file, 60 lines)
@@ -0,0 +1,60 @@
+"""
+Migration 0045 - Recalculate crawl filePageCount and errorPageCount
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0045"
+
+
+# pylint: disable=duplicate-code
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+        self.page_ops = kwargs.get("page_ops")
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Recalculate crawl filePageCount and errorPageCount for all crawls
+        """
+        crawls_mdb = self.mdb["crawls"]
+
+        if self.page_ops is None:
+            print(
+                "Unable to reset crawl page counts, missing page_ops",
+                flush=True,
+            )
+            return
+
+        match_query = {
+            "$or": [{"errorPageCount": {"$gt": 0}}, {"filePageCount": {"$gt": 0}}]
+        }
+        async for crawl_raw in crawls_mdb.find(match_query, projection=["_id"]):
+            crawl_id = crawl_raw["_id"]
+
+            try:
+                # Reset filePageCount and errorPageCount to 0
+                await crawls_mdb.find_one_and_update(
+                    {"_id": crawl_id},
+                    {
+                        "$set": {
+                            "filePageCount": 0,
+                            "errorPageCount": 0,
+                        }
+                    },
+                )
+
+                # Re-increment filePageCount and errorPageCount
+                await self.page_ops.update_crawl_file_and_error_counts(crawl_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(
+                    f"Unable to update page counts for crawl {crawl_id}: {err}",
+                    flush=True,
+                )
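For reference, a minimal sketch of driving this migration by hand (for example from an async shell); a deployed backend is expected to apply pending migrations itself. `mdb` and `page_ops` are assumed to be the Motor database handle and an initialized PageOps instance, matching what the migration's constructor accepts.

import asyncio

from btrixcloud.migrations.migration_0045_crawl_counts import Migration


# Hypothetical manual invocation; `mdb` and `page_ops` must be provided by the
# caller (they are normally wired up by the backend at startup).
async def run_migration(mdb, page_ops):
    migration = Migration(mdb, page_ops=page_ops)
    await migration.migrate_up()


# asyncio.run(run_migration(mdb, page_ops))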
@@ -246,18 +246,25 @@ class PageOps:
         await self.add_qa_run_for_page(page.id, oid, qa_run_id, compare)

     async def update_crawl_file_and_error_counts(
-        self, crawl_id: str, pages: List[Page]
+        self, crawl_id: str, pages: Optional[List[Page]] = None
     ):
         """Update crawl filePageCount and errorPageCount for pages."""
         file_count = 0
         error_count = 0

-        for page in pages:
-            if page.isFile:
-                file_count += 1
-
-            if page.isError:
-                error_count += 1
+        if pages is not None:
+            for page in pages:
+                if page.isFile:
+                    file_count += 1
+                if page.isError:
+                    error_count += 1
+        else:
+            # If page list not supplied, count all pages in crawl
+            async for page_raw in self.pages.find({"crawl_id": crawl_id}):
+                if page_raw.get("isFile"):
+                    file_count += 1
+                if page_raw.get("isError"):
+                    error_count += 1

         if file_count == 0 and error_count == 0:
             return
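A short sketch of the two calling patterns the widened signature now supports; `page_ops` and `pages` are assumed names here for an initialized PageOps instance and a batch of Page models, not part of this diff.

# Sketch only: `page_ops` is an initialized PageOps instance and `pages` a
# list of Page models built during WACZ import (both assumed).
async def recount_examples(page_ops, crawl_id: str, pages) -> None:
    # Import path: counts are derived from the in-memory batch of pages
    await page_ops.update_crawl_file_and_error_counts(crawl_id, pages)

    # Recount path (used by migration 0045): omit `pages` so the method
    # re-scans the crawl's pages collection instead
    await page_ops.update_crawl_file_and_error_counts(crawl_id)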
@@ -276,7 +283,7 @@ class PageOps:
         )

     async def delete_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None):
-        """Delete crawl pages from db"""
+        """Delete crawl pages from db and clear crawl page counts"""
         query: Dict[str, Union[str, UUID]] = {"crawl_id": crawl_id}
         if oid:
             query["oid"] = oid
@@ -289,6 +296,25 @@ class PageOps:
                 flush=True,
             )

+        try:
+            await self.crawls.find_one_and_update(
+                {"_id": crawl_id},
+                {
+                    "$set": {
+                        "pageCount": 0,
+                        "uniquePageCount": 0,
+                        "filePageCount": 0,
+                        "errorPageCount": 0,
+                    }
+                },
+            )
+        # pylint: disable=broad-except
+        except Exception as err:
+            print(
+                f"Error resetting page counts for crawl {crawl_id}: {err}",
+                flush=True,
+            )
+
     async def get_page_raw(
         self,
         page_id: UUID,
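And a sketch of the guarantee the added reset provides after page deletion; `page_ops` and `crawls_db` are assumed handles (a PageOps instance and the crawls collection), not part of this diff.

# Sketch only: `page_ops` and `crawls_db` are assumed to be available in the
# caller's context.
async def delete_and_verify(page_ops, crawls_db, crawl_id: str) -> None:
    await page_ops.delete_crawl_pages(crawl_id)

    # With this change, the stored counts are zeroed along with the pages
    crawl = await crawls_db.find_one({"_id": crawl_id})
    for field in ("pageCount", "uniquePageCount", "filePageCount", "errorPageCount"):
        assert crawl[field] == 0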
@@ -956,6 +956,19 @@ def test_crawl_pages_qa_filters(crawler_auth_headers, default_org_id, crawler_crawl_id):


 def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
+    # Store page counts to compare against after re-adding
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    page_count_before = data["pageCount"]
+    page_count_before_unique = data["uniquePageCount"]
+    page_count_before_files = data["filePageCount"]
+    page_count_before_errors = data["errorPageCount"]
+
     # Re-add pages and verify they were correctly added
     r = requests.post(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/reAdd",
@@ -1001,15 +1014,20 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     )
     assert r.status_code == 403

-    # Check that pageCount and uniquePageCount were stored on crawl
+    # Check that crawl page counts were recalculated properly
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
     data = r.json()
-    assert data["pageCount"] > 0
-    assert data["uniquePageCount"] > 0
+    assert data["pageCount"] > 0 and data["pageCount"] == page_count_before
+    assert (
+        data["uniquePageCount"] > 0
+        and data["uniquePageCount"] == page_count_before_unique
+    )
+    assert data["filePageCount"] == page_count_before_files
+    assert data["errorPageCount"] == page_count_before_errors


 def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id):