Backend: Move page file and error counts to crawl replay.json endpoint (#1868)

Backend work for #1859

- Remove the file count from the QA stats endpoint
- Compute isFile and isError for each page when the page is added
- Increment filePageCount and errorPageCount on the crawl to count the number of isFile and isError pages
- Add the file and error counts (filePageCount and errorPageCount) to the crawl replay.json endpoint
- Add migration 0028 to set isFile / isError on each existing page and aggregate filePageCount / errorPageCount per crawl
- Determine that a page is a file when loadState == 2 and it either has a non-HTML MIME type or returned a 200 status with no title; mark a page as an error when loadState == 0 (see the standalone sketch after the commit metadata)
Authored by Tessa Walsh on 2024-06-11 12:09:58 -04:00; committed by Ilya Kreymer
parent 16a720c685
commit 879e509b39
5 changed files with 163 additions and 38 deletions
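
For reference, the page classification rule this commit introduces in Page.compute_page_type() (see the models hunk below) boils down to the following standalone sketch; the helper name, signature, and sample values are illustrative only and are not part of the commit:

from typing import Optional, Tuple

def classify_page(
    load_state: Optional[int],
    mime: Optional[str],
    status: Optional[int],
    title: Optional[str],
) -> Tuple[bool, bool]:
    """Return (is_file, is_error) using the same rules as compute_page_type()."""
    is_file = False
    is_error = False
    if load_state == 2:
        # Fully loaded page: treat it as a file if it is not HTML, or if it
        # returned a 200 but produced no title.
        if mime and "html" not in mime:
            is_file = True
        elif title is None and status == 200:
            is_file = True
    elif load_state == 0:
        # Page never loaded at all: count it as an error page.
        is_error = True
    return is_file, is_error

# A successfully fetched PDF counts as a file rather than a QA-able page:
assert classify_page(2, "application/pdf", 200, None) == (True, False)
# A page that never loaded counts as an error:
assert classify_page(0, None, None, None) == (False, True)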


@@ -17,7 +17,7 @@ from pymongo.errors import InvalidName
from .migrations import BaseMigration
CURR_DB_VERSION = "0027"
CURR_DB_VERSION = "0028"
# ============================================================================


@@ -0,0 +1,71 @@
"""
Migration 0028 - Page files and errors
"""
from btrixcloud.migrations import BaseMigration
from btrixcloud.models import Page, Crawl
MIGRATION_VERSION = "0028"
class Migration(BaseMigration):
"""Migration class."""
# pylint: disable=unused-argument
def __init__(self, mdb, **kwargs):
super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
Update older crawls and their pages:
- Add crawl.filePageCount and crawl.errorPageCount
- Set Page.isFile and Page.isError
"""
pages_db = self.mdb["pages"]
crawls_db = self.mdb["crawls"]
cursor = crawls_db.find({"type": "crawl", "filePageCount": None})
async for crawl_dict in cursor:
try:
crawl = Crawl.from_dict(crawl_dict)
crawl.filePageCount = 0
crawl.errorPageCount = 0
cursor = pages_db.find({"crawl_id": crawl.id})
async for page_dict in cursor:
page = Page.from_dict(page_dict)
page.compute_page_type()
if page.isFile:
crawl.filePageCount += 1
if page.isError:
crawl.errorPageCount += 1
if page.isFile or page.isError:
await pages_db.find_one_and_update(
{"_id": page.id},
{
"$set": page.dict(
include={"isFile": True, "isError": True}
)
},
)
await crawls_db.find_one_and_update(
{"_id": crawl.id, "type": "crawl"},
{
"$set": crawl.dict(
include={"filePageCount": True, "errorPageCount": True}
)
},
)
# pylint: disable=broad-exception-caught
except Exception as err:
crawl_id = crawl_dict.get("_id")
print(
f"Error updating page counts and pages for crawl {crawl_id}: {err}",
flush=True,
)
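
One detail worth noting about the crawl filter above: matching on "filePageCount": None relies on MongoDB's null-query semantics, so it selects crawls where the field is either missing entirely or stored as null, i.e. every crawl created before this field existed. Written out explicitly, an equivalent filter would look roughly like this (a sketch, not code from the commit):

explicit_filter = {
    "type": "crawl",
    "$or": [
        {"filePageCount": {"$exists": False}},  # field missing (crawls from before this commit)
        {"filePageCount": {"$type": "null"}},   # field present but explicitly null
    ],
}
cursor = crawls_db.find(explicit_filter)  # crawls_db as defined in the migration above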


@@ -666,6 +666,9 @@ class CrawlOut(BaseMongoModel):
lastQAState: Optional[str]
lastQAStarted: Optional[datetime]
filePageCount: Optional[int] = 0
errorPageCount: Optional[int] = 0
# ============================================================================
class CrawlOutWithResources(CrawlOut):
@@ -780,6 +783,9 @@ class Crawl(BaseCrawl, CrawlConfigCore):
qa: Optional[QARun] = None
qaFinished: Optional[Dict[str, QARun]] = {}
filePageCount: Optional[int] = 0
errorPageCount: Optional[int] = 0
# ============================================================================
class CrawlCompleteIn(BaseModel):
@@ -1567,6 +1573,23 @@ class Page(BaseMongoModel):
approved: Optional[bool] = None
notes: List[PageNote] = []
isFile: Optional[bool] = False
isError: Optional[bool] = False
def compute_page_type(self):
"""sets self.isFile or self.isError flags"""
self.isFile = False
self.isError = False
if self.loadState == 2:
# pylint: disable=unsupported-membership-test
if self.mime and "html" not in self.mime:
self.isFile = True
elif self.title is None and self.status == 200:
self.isFile = True
elif self.loadState == 0:
self.isError = True
# ============================================================================
class PageWithAllQA(Page):


@@ -36,7 +36,7 @@ else:
# ============================================================================
# pylint: disable=too-many-instance-attributes, too-many-arguments
# pylint: disable=too-many-instance-attributes, too-many-arguments,too-many-public-methods
class PageOps:
"""crawl pages"""
@@ -68,7 +68,7 @@ class PageOps:
continue
if len(pages_buffer) > batch_size:
await self._add_pages_to_db(pages_buffer)
await self._add_pages_to_db(crawl_id, pages_buffer)
pages_buffer.append(
self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
@@ -76,7 +76,7 @@
# Add any remaining pages in buffer to db
if pages_buffer:
await self._add_pages_to_db(pages_buffer)
await self._add_pages_to_db(crawl_id, pages_buffer)
print(f"Added pages for crawl {crawl_id} to db", flush=True)
# pylint: disable=broad-exception-caught, raise-missing-from
@@ -84,7 +84,9 @@
traceback.print_exc()
print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)
def _get_page_from_dict(self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID):
def _get_page_from_dict(
self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID
) -> Page:
"""Return Page object from dict"""
page_id = page_dict.get("id")
if not page_id:
@@ -94,7 +96,7 @@
if not status and page_dict.get("loadState"):
status = 200
return Page(
p = Page(
id=page_id,
oid=oid,
crawl_id=crawl_id,
@@ -109,8 +111,10 @@
else datetime.now()
),
)
p.compute_page_type()
return p
async def _add_pages_to_db(self, pages: List[Page]):
async def _add_pages_to_db(self, crawl_id: str, pages: List[Page]):
"""Add batch of pages to db in one insert"""
result = await self.pages.insert_many(
[
@@ -124,6 +128,8 @@
# pylint: disable=broad-exception-raised
raise Exception("No pages inserted")
await self.update_crawl_file_and_error_counts(crawl_id, pages)
async def add_page_to_db(
self,
page_dict: Dict[str, Any],
@@ -133,12 +139,9 @@
):
"""Add page to database"""
page = self._get_page_from_dict(page_dict, crawl_id, oid)
print(f"PAGE: {page}", flush=True)
page_to_insert = page.to_dict(
exclude_unset=True, exclude_none=True, exclude_defaults=True
)
print(f"PAGE TO INSERT: {page_to_insert}")
try:
await self.pages.insert_one(page_to_insert)
@@ -153,6 +156,9 @@
)
return
if not qa_run_id and page:
await self.update_crawl_file_and_error_counts(crawl_id, [page])
# qa data
if qa_run_id and page:
compare_dict = page_dict.get("comparison")
@@ -165,6 +171,39 @@
await self.add_qa_run_for_page(page.id, oid, qa_run_id, compare)
async def update_crawl_file_and_error_counts(
self, crawl_id: str, pages: List[Page]
):
"""Update crawl filePageCount and errorPageCount for pages."""
file_count = 0
error_count = 0
for page in pages:
if page.isFile:
file_count += 1
if page.isError:
error_count += 1
if file_count == 0 and error_count == 0:
return
inc_query = {}
if file_count > 0:
inc_query["filePageCount"] = file_count
if error_count > 0:
inc_query["errorPageCount"] = error_count
await self.crawls.find_one_and_update(
{
"_id": crawl_id,
"type": "crawl",
},
{"$inc": inc_query},
)
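# To make the bookkeeping concrete: for a hypothetical batch with two file
# pages and one error page, the counting above yields the following $inc
# document (standalone sketch, not code from the commit):
sample_pages = [
    {"isFile": True, "isError": False},   # e.g. a PDF
    {"isFile": True, "isError": False},   # e.g. an image served directly
    {"isFile": False, "isError": True},   # e.g. a page that never loaded
]
sample_inc = {}
sample_file_count = sum(1 for p in sample_pages if p["isFile"])
sample_error_count = sum(1 for p in sample_pages if p["isError"])
if sample_file_count:
    sample_inc["filePageCount"] = sample_file_count
if sample_error_count:
    sample_inc["errorPageCount"] = sample_error_count
assert sample_inc == {"filePageCount": 2, "errorPageCount": 1}
# The crawl document is then updated with {"$inc": sample_inc}.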
async def delete_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None):
"""Delete crawl pages from db"""
query: Dict[str, Union[str, UUID]] = {"crawl_id": crawl_id}
@@ -501,34 +540,6 @@ class PageOps:
return [PageOut.from_dict(data) for data in items], total
async def get_crawl_file_count(self, crawl_id: str):
"""Get count of pages in crawl that are files and don't need to be QAed"""
aggregate = [
{
"$match": {
"crawl_id": crawl_id,
"loadState": 2,
"mime": {"$not": {"$regex": "^.*html", "$options": "i"}},
}
},
{"$count": "count"},
]
cursor = self.pages.aggregate(aggregate)
results = await cursor.to_list(length=1)
if not results:
return 0
result = results[0]
try:
total = int(result["count"])
except (IndexError, ValueError):
total = 0
return total
async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
"""Delete existing pages for crawl and re-add from WACZs."""
await self.delete_crawl_pages(crawl_id, oid)


@@ -196,6 +196,18 @@ def test_crawls_exclude_full_seeds(admin_auth_headers, default_org_id, admin_cra
assert config is None or config.get("seeds") is None
def test_crawls_include_file_error_page_counts(
admin_auth_headers, default_org_id, admin_crawl_id
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
assert data["filePageCount"] >= 0
assert data["errorPageCount"] >= 0
def test_download_wacz():
r = requests.get(HOST_PREFIX + wacz_path)
assert r.status_code == 200
@@ -474,6 +486,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
assert page["loadState"]
assert page["status"]
assert page["mime"]
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)
# Test GET page endpoint
global page_id
@@ -493,6 +507,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
assert page.get("title") or page.get("title") is None
assert page["loadState"]
assert page["mime"]
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)
assert page["notes"] == []
assert page.get("userid") is None
@@ -591,6 +607,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
assert page.get("title") or page.get("title") is None
assert page["loadState"]
assert page["mime"]
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)
assert page["notes"] == []
assert page["userid"]
@@ -668,6 +686,8 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
assert page["loadState"]
assert page["status"]
assert page["mime"]
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)
# Ensure only superuser can re-add pages for all crawls in an org
r = requests.post(