Backend: Move page file and error counts to crawl replay.json endpoint (#1868)
Backend work for #1859:
- Remove file count from QA stats endpoint
- Compute isFile or isError per page when the page is added
- Increment filePageCount and errorPageCount per crawl to count the number of isFile or isError pages
- Add file and error counts (filePageCount and errorPageCount) to the crawl replay.json endpoint
- Add migration 0028 to set isFile / isError for each page and aggregate filePageCount / errorPageCount per crawl
- Determine whether a page is a file based on loadState == 2, mime type or status code, and lack of title
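
An illustrative client-side sketch (not part of this commit) of reading the new counts from the replay.json endpoint; API_PREFIX, the ids, and the auth header are placeholders:

import requests

# Placeholder values; substitute a real API prefix, org id, crawl id, and token.
API_PREFIX = "https://app.example.com/api"
org_id = "<org-uuid>"
crawl_id = "<crawl-id>"
headers = {"Authorization": "Bearer <token>"}

r = requests.get(
    f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
    headers=headers,
)
data = r.json()

# Fields added by this change; both default to 0.
print("file pages:", data.get("filePageCount", 0))
print("error pages:", data.get("errorPageCount", 0))
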
Parent: 16a720c685
Commit: 879e509b39
@@ -17,7 +17,7 @@ from pymongo.errors import InvalidName
 from .migrations import BaseMigration
 
 
-CURR_DB_VERSION = "0027"
+CURR_DB_VERSION = "0028"
 
 
 # ============================================================================
@@ -0,0 +1,71 @@
+"""
+Migration 0028 - Page files and errors
+"""
+
+from btrixcloud.migrations import BaseMigration
+from btrixcloud.models import Page, Crawl
+
+
+MIGRATION_VERSION = "0028"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Update older crawls and their pages:
+        - Add crawl.filePageCount and crawl.errorPageCount
+        - Set Page.isFile and Page.isError
+        """
+        pages_db = self.mdb["pages"]
+        crawls_db = self.mdb["crawls"]
+
+        cursor = crawls_db.find({"type": "crawl", "filePageCount": None})
+        async for crawl_dict in cursor:
+            try:
+                crawl = Crawl.from_dict(crawl_dict)
+                crawl.filePageCount = 0
+                crawl.errorPageCount = 0
+
+                cursor = pages_db.find({"crawl_id": crawl.id})
+                async for page_dict in cursor:
+                    page = Page.from_dict(page_dict)
+
+                    page.compute_page_type()
+                    if page.isFile:
+                        crawl.filePageCount += 1
+
+                    if page.isError:
+                        crawl.errorPageCount += 1
+
+                    if page.isFile or page.isError:
+                        await pages_db.find_one_and_update(
+                            {"_id": page.id},
+                            {
+                                "$set": page.dict(
+                                    include={"isFile": True, "isError": True}
+                                )
+                            },
+                        )
+
+                await crawls_db.find_one_and_update(
+                    {"_id": crawl.id, "type": "crawl"},
+                    {
+                        "$set": crawl.dict(
+                            include={"filePageCount": True, "errorPageCount": True}
+                        )
+                    },
+                )
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                crawl_id = crawl_dict.get("_id")
+                print(
+                    f"Error updating page counts and pages for crawl {crawl_id}: {err}",
+                    flush=True,
+                )
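
A minimal spot-check sketch (not part of this commit) for verifying the migration with motor, assuming direct access to the backend database; the database name is an assumption:

import asyncio

from motor.motor_asyncio import AsyncIOMotorClient


async def check_migration(mongo_url: str, db_name: str = "btrix") -> None:
    # db_name is assumed; use whatever database the backend is configured with
    mdb = AsyncIOMotorClient(mongo_url)[db_name]

    # crawls the migration should have backfilled but has not yet
    missing = await mdb["crawls"].count_documents(
        {"type": "crawl", "filePageCount": None}
    )

    # pages flagged by compute_page_type()
    files = await mdb["pages"].count_documents({"isFile": True})
    errors = await mdb["pages"].count_documents({"isError": True})

    print(f"crawls still missing counts: {missing}")
    print(f"file pages: {files}, error pages: {errors}")


asyncio.run(check_migration("mongodb://localhost:27017"))
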
@@ -666,6 +666,9 @@ class CrawlOut(BaseMongoModel)
     lastQAState: Optional[str]
     lastQAStarted: Optional[datetime]
 
+    filePageCount: Optional[int] = 0
+    errorPageCount: Optional[int] = 0
+
 
 # ============================================================================
 class CrawlOutWithResources(CrawlOut):
@@ -780,6 +783,9 @@ class Crawl(BaseCrawl, CrawlConfigCore):
     qa: Optional[QARun] = None
     qaFinished: Optional[Dict[str, QARun]] = {}
 
+    filePageCount: Optional[int] = 0
+    errorPageCount: Optional[int] = 0
+
 
 # ============================================================================
 class CrawlCompleteIn(BaseModel):
@@ -1567,6 +1573,23 @@ class Page(BaseMongoModel):
     approved: Optional[bool] = None
     notes: List[PageNote] = []
 
+    isFile: Optional[bool] = False
+    isError: Optional[bool] = False
+
+    def compute_page_type(self):
+        """sets self.isFile or self.isError flags"""
+        self.isFile = False
+        self.isError = False
+        if self.loadState == 2:
+            # pylint: disable=unsupported-membership-test
+            if self.mime and "html" not in self.mime:
+                self.isFile = True
+            elif self.title is None and self.status == 200:
+                self.isFile = True
+
+        elif self.loadState == 0:
+            self.isError = True
+
 
 # ============================================================================
 class PageWithAllQA(Page):
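
For reference, the classification rules above restated as a standalone illustration (not the model method): a loadState == 2 page is a file when its mime type is non-HTML, or when it has no title despite a 200 status; a loadState == 0 page is an error.

from typing import Optional, Tuple


def classify_page(
    load_state: int,
    mime: Optional[str],
    title: Optional[str],
    status: Optional[int],
) -> Tuple[bool, bool]:
    """Return (is_file, is_error) following the same rules as Page.compute_page_type()"""
    is_file = False
    is_error = False
    if load_state == 2:
        if mime and "html" not in mime:
            is_file = True
        elif title is None and status == 200:
            is_file = True
    elif load_state == 0:
        is_error = True
    return is_file, is_error


assert classify_page(2, "application/pdf", None, 200) == (True, False)
assert classify_page(2, "text/html", "Home", 200) == (False, False)
assert classify_page(0, None, None, None) == (False, True)
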
@@ -36,7 +36,7 @@ else:
 
 
 # ============================================================================
-# pylint: disable=too-many-instance-attributes, too-many-arguments
+# pylint: disable=too-many-instance-attributes, too-many-arguments,too-many-public-methods
 class PageOps:
     """crawl pages"""
 
@@ -68,7 +68,7 @@ class PageOps:
                     continue
 
                 if len(pages_buffer) > batch_size:
-                    await self._add_pages_to_db(pages_buffer)
+                    await self._add_pages_to_db(crawl_id, pages_buffer)
 
                 pages_buffer.append(
                     self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
@@ -76,7 +76,7 @@
 
             # Add any remaining pages in buffer to db
             if pages_buffer:
-                await self._add_pages_to_db(pages_buffer)
+                await self._add_pages_to_db(crawl_id, pages_buffer)
 
             print(f"Added pages for crawl {crawl_id} to db", flush=True)
         # pylint: disable=broad-exception-caught, raise-missing-from
@@ -84,7 +84,9 @@ class PageOps:
             traceback.print_exc()
             print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)
 
-    def _get_page_from_dict(self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID):
+    def _get_page_from_dict(
+        self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID
+    ) -> Page:
         """Return Page object from dict"""
         page_id = page_dict.get("id")
         if not page_id:
@@ -94,7 +96,7 @@ class PageOps:
         if not status and page_dict.get("loadState"):
             status = 200
 
-        return Page(
+        p = Page(
             id=page_id,
             oid=oid,
             crawl_id=crawl_id,
@@ -109,8 +111,10 @@ class PageOps:
                 else datetime.now()
             ),
         )
+        p.compute_page_type()
+        return p
 
-    async def _add_pages_to_db(self, pages: List[Page]):
+    async def _add_pages_to_db(self, crawl_id: str, pages: List[Page]):
         """Add batch of pages to db in one insert"""
         result = await self.pages.insert_many(
             [
@@ -124,6 +128,8 @@ class PageOps:
             # pylint: disable=broad-exception-raised
             raise Exception("No pages inserted")
 
+        await self.update_crawl_file_and_error_counts(crawl_id, pages)
+
     async def add_page_to_db(
         self,
         page_dict: Dict[str, Any],
@@ -133,12 +139,9 @@
     ):
         """Add page to database"""
         page = self._get_page_from_dict(page_dict, crawl_id, oid)
-        print(f"PAGE: {page}", flush=True)
 
         page_to_insert = page.to_dict(
             exclude_unset=True, exclude_none=True, exclude_defaults=True
         )
-        print(f"PAGE TO INSERT: {page_to_insert}")
-
         try:
             await self.pages.insert_one(page_to_insert)
@@ -153,6 +156,9 @@
             )
             return
 
+        if not qa_run_id and page:
+            await self.update_crawl_file_and_error_counts(crawl_id, [page])
+
         # qa data
         if qa_run_id and page:
             compare_dict = page_dict.get("comparison")
@@ -165,6 +171,39 @@
 
             await self.add_qa_run_for_page(page.id, oid, qa_run_id, compare)
 
+    async def update_crawl_file_and_error_counts(
+        self, crawl_id: str, pages: List[Page]
+    ):
+        """Update crawl filePageCount and errorPageCount for pages."""
+        file_count = 0
+        error_count = 0
+
+        for page in pages:
+            if page.isFile:
+                file_count += 1
+
+            if page.isError:
+                error_count += 1
+
+        if file_count == 0 and error_count == 0:
+            return
+
+        inc_query = {}
+
+        if file_count > 0:
+            inc_query["filePageCount"] = file_count
+
+        if error_count > 0:
+            inc_query["errorPageCount"] = error_count
+
+        await self.crawls.find_one_and_update(
+            {
+                "_id": crawl_id,
+                "type": "crawl",
+            },
+            {"$inc": inc_query},
+        )
+
     async def delete_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None):
         """Delete crawl pages from db"""
         query: Dict[str, Union[str, UUID]] = {"crawl_id": crawl_id}
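
The counter update uses $inc so that concurrent page batches add to the crawl totals instead of overwriting them. A short illustration (example values) of the update document built above:

# Suppose a batch classified as 3 file pages and 1 error page.
file_count, error_count = 3, 1

inc_query = {}
if file_count > 0:
    inc_query["filePageCount"] = file_count
if error_count > 0:
    inc_query["errorPageCount"] = error_count

# Update issued against the matching crawl document:
# {"$inc": {"filePageCount": 3, "errorPageCount": 1}}
print({"$inc": inc_query})
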
@@ -501,34 +540,6 @@ class PageOps:
 
         return [PageOut.from_dict(data) for data in items], total
 
-    async def get_crawl_file_count(self, crawl_id: str):
-        """Get count of pages in crawl that are files and don't need to be QAed"""
-        aggregate = [
-            {
-                "$match": {
-                    "crawl_id": crawl_id,
-                    "loadState": 2,
-                    "mime": {"$not": {"$regex": "^.*html", "$options": "i"}},
-                }
-            },
-            {"$count": "count"},
-        ]
-
-        cursor = self.pages.aggregate(aggregate)
-        results = await cursor.to_list(length=1)
-
-        if not results:
-            return 0
-
-        result = results[0]
-
-        try:
-            total = int(result["count"])
-        except (IndexError, ValueError):
-            total = 0
-
-        return total
-
     async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
         """Delete existing pages for crawl and re-add from WACZs."""
         await self.delete_crawl_pages(crawl_id, oid)
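
With the per-page flags stored, the removed aggregation is no longer needed; an equivalent count can be read back with a plain query on the new field (a sketch assuming a motor/pymongo collection like self.pages, e.g. for debugging):

async def count_file_pages(pages, crawl_id: str) -> int:
    """Count pages flagged as files for a crawl via the new isFile field"""
    return await pages.count_documents({"crawl_id": crawl_id, "isFile": True})
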
@@ -196,6 +196,18 @@ def test_crawls_exclude_full_seeds(admin_auth_headers, default_org_id, admin_cra
     assert config is None or config.get("seeds") is None
 
 
+def test_crawls_include_file_error_page_counts(
+    admin_auth_headers, default_org_id, admin_crawl_id
+):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    assert data["filePageCount"] >= 0
+    assert data["errorPageCount"] >= 0
+
+
 def test_download_wacz():
     r = requests.get(HOST_PREFIX + wacz_path)
     assert r.status_code == 200
@@ -474,6 +486,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page["loadState"]
         assert page["status"]
         assert page["mime"]
+        assert page["isError"] in (True, False)
+        assert page["isFile"] in (True, False)
 
     # Test GET page endpoint
     global page_id
@@ -493,6 +507,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page.get("title") or page.get("title") is None
     assert page["loadState"]
     assert page["mime"]
+    assert page["isError"] in (True, False)
+    assert page["isFile"] in (True, False)
 
     assert page["notes"] == []
     assert page.get("userid") is None
@@ -591,6 +607,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page.get("title") or page.get("title") is None
     assert page["loadState"]
     assert page["mime"]
+    assert page["isError"] in (True, False)
+    assert page["isFile"] in (True, False)
 
     assert page["notes"] == []
     assert page["userid"]
@@ -668,6 +686,8 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
         assert page["loadState"]
         assert page["status"]
         assert page["mime"]
+        assert page["isError"] in (True, False)
+        assert page["isFile"] in (True, False)
 
     # Ensure only superuser can re-add pages for all crawls in an org
     r = requests.post(