Add mime field to Page model (#1678)

Tessa Walsh 2024-04-17 00:57:49 -04:00 committed by GitHub
parent 1b034957ff
commit 87e0873f1a
4 changed files with 17 additions and 5 deletions

@@ -1529,6 +1529,7 @@ class Page(BaseMongoModel):
     ts: Optional[datetime] = None
     loadState: Optional[int] = None
     status: Optional[int] = None
+    mime: Optional[str] = None

     # manual review
     userid: Optional[UUID] = None
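For reference, the new field is a plain Optional[str] on the pydantic Page model, so it serializes alongside loadState and status. A minimal sketch of that behavior, assuming pydantic v1-style .dict(); PageSketch is a stripped-down stand-in for Page, with BaseMongoModel and the remaining fields omitted:

# Sketch only: the field names come from the diff above; everything
# else about the real Page model is simplified away.
from datetime import datetime
from typing import Optional
from uuid import UUID

from pydantic import BaseModel


class PageSketch(BaseModel):
    ts: Optional[datetime] = None
    loadState: Optional[int] = None
    status: Optional[int] = None
    mime: Optional[str] = None
    userid: Optional[UUID] = None


page = PageSketch(status=200, mime="text/html")
# exclude_none drops unset optional fields, matching the
# exclude_none=True serialization used in pages.py below.
print(page.dict(exclude_none=True))  # {'status': 200, 'mime': 'text/html'}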

@@ -101,6 +101,7 @@ class PageOps:
             title=page_dict.get("title"),
             loadState=page_dict.get("loadState"),
             status=status,
+            mime=page_dict.get("mime", "text/html"),
             ts=(
                 from_k8s_date(page_dict.get("ts"))
                 if page_dict.get("ts")
@@ -131,13 +132,15 @@
     ):
         """Add page to database"""
         page = self._get_page_from_dict(page_dict, crawl_id, oid)
+        print(f"PAGE: {page}", flush=True)
+
+        page_to_insert = page.to_dict(
+            exclude_unset=True, exclude_none=True, exclude_defaults=True
+        )
+        print(f"PAGE TO INSERT: {page_to_insert}")
         try:
-            await self.pages.insert_one(
-                page.to_dict(
-                    exclude_unset=True, exclude_none=True, exclude_defaults=True
-                )
-            )
+            await self.pages.insert_one(page_to_insert)
         except pymongo.errors.DuplicateKeyError:
             pass
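Two things happen in the hunks above: _get_page_from_dict now defaults mime to "text/html" when the crawler reported no value, and add_page_to_db serializes the page once into page_to_insert before inserting, swallowing DuplicateKeyError so that re-adding existing pages is a no-op. A standalone sketch of that insert pattern, assuming a motor async collection; only insert_one and DuplicateKeyError come from the diff, and the function name is illustrative:

import pymongo.errors


async def insert_page_once(pages, page_to_insert: dict) -> bool:
    """Insert one page document, treating a duplicate _id as a no-op.

    `pages` is assumed to be a motor async collection whose _id values
    are the page ids, so a second insert of the same page raises
    DuplicateKeyError rather than writing a second document.
    """
    try:
        await pages.insert_one(page_to_insert)
        return True
    except pymongo.errors.DuplicateKeyError:
        # Already inserted, e.g. when pages are re-added for a crawl.
        return False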

@@ -214,6 +214,8 @@ def test_qa_page_data(
     assert page["title"] == "Webrecorder"
     assert page["url"] == "https://webrecorder.net/"
+    assert page["mime"] == "text/html"
+    assert page["status"] == 200
     assert page["qa"]["textMatch"] == 1.0
     assert page["qa"]["screenshotMatch"] == 1.0
     assert page["qa"]["resourceCounts"] == {
@@ -231,6 +233,8 @@
     assert page["id"]
     assert page["title"] == "Webrecorder"
     assert page["url"] == "https://webrecorder.net/"
+    assert page["mime"] == "text/html"
+    assert page["status"] == 200
     assert page["qa"]["textMatch"] == 1.0
     assert page["qa"]["screenshotMatch"] == 1.0
     assert page["qa"]["resourceCounts"] == {

@@ -435,6 +435,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page.get("title") or page.get("title") is None
     assert page["loadState"]
     assert page["status"]
+    assert page["mime"]

     # Test GET page endpoint
     global page_id
@@ -453,6 +454,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page["ts"]
     assert page.get("title") or page.get("title") is None
     assert page["loadState"]
+    assert page["mime"]
     assert page["notes"] == []
     assert page.get("userid") is None
@@ -550,6 +552,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page["ts"]
     assert page.get("title") or page.get("title") is None
     assert page["loadState"]
+    assert page["mime"]
     assert page["notes"] == []
     assert page["userid"]
@@ -626,6 +629,7 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page.get("title") or page.get("title") is None
     assert page["loadState"]
     assert page["status"]
+    assert page["mime"]

     # Ensure only superuser can re-add pages for all crawls in an org
     r = requests.post(
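All of the test assertions above follow the same fetch-and-check pattern against the pages listing endpoint. A hedged standalone version of that pattern; API_PREFIX, the URL shape, and the paginated "items" key are assumptions modeled on typical browsertrix test fixtures, not taken from this diff:

import requests

API_PREFIX = "http://127.0.0.1:30870/api"  # assumed test host


def assert_pages_have_mime(org_id, crawl_id, auth_headers):
    # Assumed endpoint shape, mirroring the list-pages requests the tests make.
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/pages",
        headers=auth_headers,
    )
    assert r.status_code == 200
    for page in r.json()["items"]:
        # mime is Optional[str] on the model, and pages written before this
        # change may lack it, so only check the type when it is present.
        assert page.get("mime") is None or isinstance(page["mime"], str)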