Backend: Move page file and error counts to crawl replay.json endpoint (#1868)
Backend work for #1859:
- Remove file count from QA stats endpoint
- Compute isFile or isError per page when the page is added
- Increment filePageCount and errorPageCount per crawl to count the number of isFile or isError pages
- Add file and error counts to the crawl replay.json endpoint (filePageCount and errorPageCount)
- Add migration 0028 to set isFile / isError for each page and aggregate filePageCount / errorPageCount per crawl
- Determine whether a page is a file based on loadState == 2, its mime type or status code, and the lack of a title
parent 16a720c685
commit 879e509b39
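
Aside (not part of the commit): a minimal client-side sketch of reading the new counts from the crawl replay.json endpoint, mirroring the test added below. The base URL, ids, and auth header values are placeholders, not values from this commit:

    import requests

    # Placeholder values: substitute your deployment's API prefix, org id, crawl id, and auth token.
    API_PREFIX = "https://example.com/api"
    org_id = "<org-uuid>"
    crawl_id = "<crawl-id>"
    auth_headers = {"Authorization": "Bearer <token>"}

    # Fetch crawl details from the replay.json endpoint and read the new per-crawl counters.
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
        headers=auth_headers,
    )
    data = r.json()
    print("file pages:", data["filePageCount"], "error pages:", data["errorPageCount"])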
@@ -17,7 +17,7 @@ from pymongo.errors import InvalidName
 from .migrations import BaseMigration
 
 
-CURR_DB_VERSION = "0027"
+CURR_DB_VERSION = "0028"
 
 
 # ============================================================================
@@ -0,0 +1,71 @@
+"""
+Migration 0028 - Page files and errors
+"""
+
+from btrixcloud.migrations import BaseMigration
+from btrixcloud.models import Page, Crawl
+
+
+MIGRATION_VERSION = "0028"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Update older crawls and their pages:
+        - Add crawl.filePageCount and crawl.errorPageCount
+        - Set Page.isFile and Page.isError
+        """
+        pages_db = self.mdb["pages"]
+        crawls_db = self.mdb["crawls"]
+
+        cursor = crawls_db.find({"type": "crawl", "filePageCount": None})
+        async for crawl_dict in cursor:
+            try:
+                crawl = Crawl.from_dict(crawl_dict)
+                crawl.filePageCount = 0
+                crawl.errorPageCount = 0
+
+                cursor = pages_db.find({"crawl_id": crawl.id})
+                async for page_dict in cursor:
+                    page = Page.from_dict(page_dict)
+
+                    page.compute_page_type()
+                    if page.isFile:
+                        crawl.filePageCount += 1
+
+                    if page.isError:
+                        crawl.errorPageCount += 1
+
+                    if page.isFile or page.isError:
+                        await pages_db.find_one_and_update(
+                            {"_id": page.id},
+                            {
+                                "$set": page.dict(
+                                    include={"isFile": True, "isError": True}
+                                )
+                            },
+                        )
+
+                await crawls_db.find_one_and_update(
+                    {"_id": crawl.id, "type": "crawl"},
+                    {
+                        "$set": crawl.dict(
+                            include={"filePageCount": True, "errorPageCount": True}
+                        )
+                    },
+                )
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                crawl_id = crawl_dict.get("_id")
+                print(
+                    f"Error updating page counts and pages for crawl {crawl_id}: {err}",
+                    flush=True,
+                )
@@ -666,6 +666,9 @@ class CrawlOut(BaseMongoModel):
     lastQAState: Optional[str]
     lastQAStarted: Optional[datetime]
 
+    filePageCount: Optional[int] = 0
+    errorPageCount: Optional[int] = 0
+
 
 # ============================================================================
 class CrawlOutWithResources(CrawlOut):
@@ -780,6 +783,9 @@ class Crawl(BaseCrawl, CrawlConfigCore):
     qa: Optional[QARun] = None
     qaFinished: Optional[Dict[str, QARun]] = {}
 
+    filePageCount: Optional[int] = 0
+    errorPageCount: Optional[int] = 0
+
 
 # ============================================================================
 class CrawlCompleteIn(BaseModel):
@@ -1567,6 +1573,23 @@ class Page(BaseMongoModel):
     approved: Optional[bool] = None
     notes: List[PageNote] = []
 
+    isFile: Optional[bool] = False
+    isError: Optional[bool] = False
+
+    def compute_page_type(self):
+        """sets self.isFile or self.isError flags"""
+        self.isFile = False
+        self.isError = False
+        if self.loadState == 2:
+            # pylint: disable=unsupported-membership-test
+            if self.mime and "html" not in self.mime:
+                self.isFile = True
+            elif self.title is None and self.status == 200:
+                self.isFile = True
+
+        elif self.loadState == 0:
+            self.isError = True
+
 
 # ============================================================================
 class PageWithAllQA(Page):
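
Aside (not part of the diff): a standalone sketch of the classification rules added above. classify_page is a hypothetical helper that mirrors Page.compute_page_type(); it is not part of the btrixcloud code:

    def classify_page(load_state, mime, status, title):
        """Mirror the compute_page_type() rules; returns (is_file, is_error)."""
        is_file = is_error = False
        if load_state == 2:
            # Per the rule above: non-HTML mime, or a 200 response with no title, is flagged as a file
            if mime and "html" not in mime:
                is_file = True
            elif title is None and status == 200:
                is_file = True
        elif load_state == 0:
            # Per the rule above: loadState 0 is flagged as an error page
            is_error = True
        return is_file, is_error

    assert classify_page(2, "application/pdf", 200, None) == (True, False)
    assert classify_page(0, None, 0, None) == (False, True)
    assert classify_page(2, "text/html", 200, "Home") == (False, False)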
@@ -36,7 +36,7 @@ else:
 
 
 # ============================================================================
-# pylint: disable=too-many-instance-attributes, too-many-arguments
+# pylint: disable=too-many-instance-attributes, too-many-arguments,too-many-public-methods
 class PageOps:
     """crawl pages"""
 
@@ -68,7 +68,7 @@ class PageOps:
                     continue
 
                 if len(pages_buffer) > batch_size:
-                    await self._add_pages_to_db(pages_buffer)
+                    await self._add_pages_to_db(crawl_id, pages_buffer)
 
                 pages_buffer.append(
                     self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
@@ -76,7 +76,7 @@ class PageOps:
 
             # Add any remaining pages in buffer to db
             if pages_buffer:
-                await self._add_pages_to_db(pages_buffer)
+                await self._add_pages_to_db(crawl_id, pages_buffer)
 
             print(f"Added pages for crawl {crawl_id} to db", flush=True)
         # pylint: disable=broad-exception-caught, raise-missing-from
@@ -84,7 +84,9 @@ class PageOps:
             traceback.print_exc()
             print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)
 
-    def _get_page_from_dict(self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID):
+    def _get_page_from_dict(
+        self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID
+    ) -> Page:
         """Return Page object from dict"""
         page_id = page_dict.get("id")
         if not page_id:
@@ -94,7 +96,7 @@ class PageOps:
         if not status and page_dict.get("loadState"):
             status = 200
 
-        return Page(
+        p = Page(
             id=page_id,
             oid=oid,
             crawl_id=crawl_id,
@@ -109,8 +111,10 @@ class PageOps:
                 else datetime.now()
             ),
         )
+        p.compute_page_type()
+        return p
 
-    async def _add_pages_to_db(self, pages: List[Page]):
+    async def _add_pages_to_db(self, crawl_id: str, pages: List[Page]):
         """Add batch of pages to db in one insert"""
         result = await self.pages.insert_many(
             [
|
|||||||
# pylint: disable=broad-exception-raised
|
# pylint: disable=broad-exception-raised
|
||||||
raise Exception("No pages inserted")
|
raise Exception("No pages inserted")
|
||||||
|
|
||||||
|
await self.update_crawl_file_and_error_counts(crawl_id, pages)
|
||||||
|
|
||||||
async def add_page_to_db(
|
async def add_page_to_db(
|
||||||
self,
|
self,
|
||||||
page_dict: Dict[str, Any],
|
page_dict: Dict[str, Any],
|
||||||
@ -133,12 +139,9 @@ class PageOps:
|
|||||||
):
|
):
|
||||||
"""Add page to database"""
|
"""Add page to database"""
|
||||||
page = self._get_page_from_dict(page_dict, crawl_id, oid)
|
page = self._get_page_from_dict(page_dict, crawl_id, oid)
|
||||||
print(f"PAGE: {page}", flush=True)
|
|
||||||
|
|
||||||
page_to_insert = page.to_dict(
|
page_to_insert = page.to_dict(
|
||||||
exclude_unset=True, exclude_none=True, exclude_defaults=True
|
exclude_unset=True, exclude_none=True, exclude_defaults=True
|
||||||
)
|
)
|
||||||
print(f"PAGE TO INSERT: {page_to_insert}")
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await self.pages.insert_one(page_to_insert)
|
await self.pages.insert_one(page_to_insert)
|
||||||
@ -153,6 +156,9 @@ class PageOps:
|
|||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
if not qa_run_id and page:
|
||||||
|
await self.update_crawl_file_and_error_counts(crawl_id, [page])
|
||||||
|
|
||||||
# qa data
|
# qa data
|
||||||
if qa_run_id and page:
|
if qa_run_id and page:
|
||||||
compare_dict = page_dict.get("comparison")
|
compare_dict = page_dict.get("comparison")
|
||||||
@@ -165,6 +171,39 @@ class PageOps:
 
             await self.add_qa_run_for_page(page.id, oid, qa_run_id, compare)
 
+    async def update_crawl_file_and_error_counts(
+        self, crawl_id: str, pages: List[Page]
+    ):
+        """Update crawl filePageCount and errorPageCount for pages."""
+        file_count = 0
+        error_count = 0
+
+        for page in pages:
+            if page.isFile:
+                file_count += 1
+
+            if page.isError:
+                error_count += 1
+
+        if file_count == 0 and error_count == 0:
+            return
+
+        inc_query = {}
+
+        if file_count > 0:
+            inc_query["filePageCount"] = file_count
+
+        if error_count > 0:
+            inc_query["errorPageCount"] = error_count
+
+        await self.crawls.find_one_and_update(
+            {
+                "_id": crawl_id,
+                "type": "crawl",
+            },
+            {"$inc": inc_query},
+        )
+
     async def delete_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None):
         """Delete crawl pages from db"""
         query: Dict[str, Union[str, UUID]] = {"crawl_id": crawl_id}
@@ -501,34 +540,6 @@ class PageOps:
 
         return [PageOut.from_dict(data) for data in items], total
 
-    async def get_crawl_file_count(self, crawl_id: str):
-        """Get count of pages in crawl that are files and don't need to be QAed"""
-        aggregate = [
-            {
-                "$match": {
-                    "crawl_id": crawl_id,
-                    "loadState": 2,
-                    "mime": {"$not": {"$regex": "^.*html", "$options": "i"}},
-                }
-            },
-            {"$count": "count"},
-        ]
-
-        cursor = self.pages.aggregate(aggregate)
-        results = await cursor.to_list(length=1)
-
-        if not results:
-            return 0
-
-        result = results[0]
-
-        try:
-            total = int(result["count"])
-        except (IndexError, ValueError):
-            total = 0
-
-        return total
-
     async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
         """Delete existing pages for crawl and re-add from WACZs."""
         await self.delete_crawl_pages(crawl_id, oid)
@@ -196,6 +196,18 @@ def test_crawls_exclude_full_seeds(admin_auth_headers, default_org_id, admin_cra
     assert config is None or config.get("seeds") is None
 
 
+def test_crawls_include_file_error_page_counts(
+    admin_auth_headers, default_org_id, admin_crawl_id
+):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    assert data["filePageCount"] >= 0
+    assert data["errorPageCount"] >= 0
+
+
 def test_download_wacz():
     r = requests.get(HOST_PREFIX + wacz_path)
     assert r.status_code == 200
@@ -474,6 +486,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page["loadState"]
         assert page["status"]
         assert page["mime"]
+        assert page["isError"] in (True, False)
+        assert page["isFile"] in (True, False)
 
     # Test GET page endpoint
    global page_id
@@ -493,6 +507,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page.get("title") or page.get("title") is None
     assert page["loadState"]
     assert page["mime"]
+    assert page["isError"] in (True, False)
+    assert page["isFile"] in (True, False)
 
     assert page["notes"] == []
     assert page.get("userid") is None
@@ -591,6 +607,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page.get("title") or page.get("title") is None
     assert page["loadState"]
     assert page["mime"]
+    assert page["isError"] in (True, False)
+    assert page["isFile"] in (True, False)
 
     assert page["notes"] == []
     assert page["userid"]
@@ -668,6 +686,8 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
         assert page["loadState"]
         assert page["status"]
         assert page["mime"]
+        assert page["isError"] in (True, False)
+        assert page["isFile"] in (True, False)
 
     # Ensure only superuser can re-add pages for all crawls in an org
     r = requests.post(