Add collection page list/search endpoint (#2354)

Fixes #2353

Adds a new endpoint to list pages in a collection, with filtering on
`url` (exact match), `urlPrefix`, `ts`, `isSeed`, and `depth`, along
with accompanying tests. Additional sort options have been added as
well.
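
For illustration, a request against the new endpoint could look like the
sketch below; the deployment URL, org and collection IDs, and auth token are
placeholders, not values taken from this change:

    import requests

    API_PREFIX = "https://app.example.com/api"  # placeholder deployment URL
    headers = {"Authorization": "Bearer <token>"}  # placeholder auth header

    # List pages in a collection, filtered to seed pages under a URL prefix,
    # newest first
    r = requests.get(
        f"{API_PREFIX}/orgs/<org_id>/collections/<coll_id>/pages",
        params={
            "urlPrefix": "https://webrecorder.net/",
            "isSeed": "true",
            "sortBy": "ts",
            "sortDirection": -1,
            "page": 1,
            "pageSize": 25,
        },
        headers=headers,
    )
    data = r.json()
    print(data["total"], [page["url"] for page in data["items"]])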

These same filters and sort options have also been added to the crawl
pages endpoint.
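
Continuing the sketch above with the same placeholders, the crawl pages
endpoint now accepts the same query parameters:

    # Seed pages at depth 0 for a single crawl, sorted by URL ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/<org_id>/crawls/<crawl_id>/pages",
        params={"isSeed": "true", "depth": 0, "sortBy": "url", "sortDirection": 1},
        headers=headers,
    )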

Also fixes an issue where `isSeed` was not stored in the database when
false, only added on serialization, which prevented filtering on that
field from working as expected.
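
A minimal sketch of the underlying serialization behavior, using a
hypothetical stand-in model and Pydantic v2's model_dump rather than the real
Page model and its to_dict() wrapper:

    from pydantic import BaseModel

    class Page(BaseModel):
        url: str
        isSeed: bool = False

    # The ingest code now sets isSeed explicitly, even when it is False
    page = Page(url="https://example.com/", isSeed=False)

    # Old serialization: exclude_defaults drops isSeed because False equals
    # the field default, so the stored document has no isSeed field and a
    # {"isSeed": False} filter never matches it.
    page.model_dump(exclude_unset=True, exclude_none=True, exclude_defaults=True)
    # -> {'url': 'https://example.com/'}

    # New serialization: without exclude_defaults, the explicit False is kept.
    page.model_dump(exclude_unset=True, exclude_none=True)
    # -> {'url': 'https://example.com/', 'isSeed': False}
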
Tessa Walsh 2025-02-10 19:44:37 -05:00 committed by GitHub
parent 001839a521
commit 98a45b0d85
6 changed files with 423 additions and 23 deletions

View File

@ -248,7 +248,14 @@ def main() -> None:
upload_ops = init_uploads_api(*base_crawl_init)
page_ops = init_pages_api(
- app, mdb, crawls, org_ops, storage_ops, background_job_ops, current_active_user
+ app,
+ mdb,
+ crawls,
+ org_ops,
+ storage_ops,
+ background_job_ops,
+ coll_ops,
+ current_active_user,
)
base_crawl_ops.set_page_ops(page_ops)

View File

@ -89,7 +89,9 @@ def init_ops() -> Tuple[
upload_ops = UploadOps(*base_crawl_init)
- page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops)
+ page_ops = PageOps(
+ mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops
+ )
base_crawl_ops.set_page_ops(page_ops)
crawl_ops.set_page_ops(page_ops)

View File

@ -1,8 +1,12 @@
"""crawl pages"""
# pylint: disable=too-many-lines
import asyncio
import os
import re
import traceback
import urllib.parse
from datetime import datetime
from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
from uuid import UUID, uuid4
@ -37,11 +41,12 @@ from .utils import str_to_date, str_list_to_bools, dt_now
if TYPE_CHECKING:
from .background_jobs import BackgroundJobOps
+ from .colls import CollectionOps
from .crawls import CrawlOps
from .orgs import OrgOps
from .storages import StorageOps
else:
- CrawlOps = StorageOps = OrgOps = BackgroundJobOps = object
+ CrawlOps = StorageOps = OrgOps = BackgroundJobOps = CollectionOps = object
# ============================================================================
@ -53,14 +58,18 @@ class PageOps:
org_ops: OrgOps
storage_ops: StorageOps
background_job_ops: BackgroundJobOps
+ coll_ops: CollectionOps
- def __init__(self, mdb, crawl_ops, org_ops, storage_ops, background_job_ops):
+ def __init__(
+ self, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops
+ ):
self.pages = mdb["pages"]
self.crawls = mdb["crawls"]
self.crawl_ops = crawl_ops
self.org_ops = org_ops
self.storage_ops = storage_ops
self.background_job_ops = background_job_ops
+ self.coll_ops = coll_ops
async def init_index(self):
"""init index for pages db collection"""
@ -82,6 +91,9 @@ class PageOps:
if not page_dict.get("url"):
continue
if not page_dict.get("isSeed"):
page_dict["isSeed"] = False
if len(pages_buffer) > batch_size:
await self._add_pages_to_db(crawl_id, pages_buffer)
pages_buffer = []
@ -210,9 +222,8 @@ class PageOps:
):
"""Add page to database"""
page = self._get_page_from_dict(page_dict, crawl_id, oid)
- page_to_insert = page.to_dict(
- exclude_unset=True, exclude_none=True, exclude_defaults=True
- )
+ page_to_insert = page.to_dict(exclude_unset=True, exclude_none=True)
try:
await self.pages.insert_one(page_to_insert)
@ -492,6 +503,11 @@ class PageOps:
self,
crawl_id: str,
org: Optional[Organization] = None,
+ url: Optional[str] = None,
+ url_prefix: Optional[str] = None,
+ ts: Optional[datetime] = None,
+ is_seed: Optional[bool] = None,
+ depth: Optional[int] = None,
qa_run_id: Optional[str] = None,
qa_filter_by: Optional[str] = None,
qa_gte: Optional[float] = None,
@ -518,6 +534,23 @@ class PageOps:
if org:
query["oid"] = org.id
+ if url_prefix:
+ url_prefix = urllib.parse.unquote(url_prefix)
+ regex_pattern = f"^{re.escape(url_prefix)}"
+ query["url"] = {"$regex": regex_pattern, "$options": "i"}
+ elif url:
+ query["url"] = urllib.parse.unquote(url)
+ if ts:
+ query["ts"] = ts
+ if is_seed in (True, False):
+ query["isSeed"] = is_seed
+ if isinstance(depth, int):
+ query["depth"] = depth
if reviewed:
query["$or"] = [
{"approved": {"$ne": None}},
@ -562,7 +595,18 @@ class PageOps:
# Sorting options to add:
# - automated heuristics like screenshot_comparison (dict keyed by QA run id)
# - Ensure notes sorting works okay with notes in list
sort_fields = ("url", "title", "notes", "approved")
sort_fields = (
"url",
"title",
"notes",
"approved",
"ts",
"status",
"mime",
"filename",
"depth",
"isSeed",
)
qa_sort_fields = ("screenshotMatch", "textMatch")
if sort_by not in sort_fields and sort_by not in qa_sort_fields:
raise HTTPException(status_code=400, detail="invalid_sort_by")
@ -613,6 +657,101 @@ class PageOps:
return [PageOut.from_dict(data) for data in items], total
async def list_collection_pages(
self,
coll_id: UUID,
org: Optional[Organization] = None,
url: Optional[str] = None,
url_prefix: Optional[str] = None,
ts: Optional[datetime] = None,
is_seed: Optional[bool] = None,
depth: Optional[int] = None,
page_size: int = DEFAULT_PAGE_SIZE,
page: int = 1,
sort_by: Optional[str] = None,
sort_direction: Optional[int] = -1,
) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]:
"""List all pages in collection, with optional filtering"""
# pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
# Zero-index page for query
page = page - 1
skip = page_size * page
crawl_ids = await self.coll_ops.get_collection_crawl_ids(coll_id)
query: dict[str, object] = {
"crawl_id": {"$in": crawl_ids},
}
if org:
query["oid"] = org.id
if url_prefix:
url_prefix = urllib.parse.unquote(url_prefix)
regex_pattern = f"^{re.escape(url_prefix)}"
query["url"] = {"$regex": regex_pattern, "$options": "i"}
elif url:
query["url"] = urllib.parse.unquote(url)
if ts:
query["ts"] = ts
if is_seed in (True, False):
query["isSeed"] = is_seed
if isinstance(depth, int):
query["depth"] = depth
aggregate = [{"$match": query}]
if sort_by:
# Sorting options to add:
# - automated heuristics like screenshot_comparison (dict keyed by QA run id)
# - Ensure notes sorting works okay with notes in list
sort_fields = (
"url",
"crawl_id",
"ts",
"status",
"mime",
"filename",
"depth",
"isSeed",
)
if sort_by not in sort_fields:
raise HTTPException(status_code=400, detail="invalid_sort_by")
if sort_direction not in (1, -1):
raise HTTPException(status_code=400, detail="invalid_sort_direction")
aggregate.extend([{"$sort": {sort_by: sort_direction}}])
aggregate.extend(
[
{
"$facet": {
"items": [
{"$skip": skip},
{"$limit": page_size},
],
"total": [{"$count": "count"}],
}
},
]
)
# Get total
cursor = self.pages.aggregate(aggregate)
results = await cursor.to_list(length=1)
result = results[0]
items = result["items"]
try:
total = int(result["total"][0]["count"])
except (IndexError, ValueError):
total = 0
return [PageOut.from_dict(data) for data in items], total
async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
"""Delete existing pages for crawl and re-add from WACZs."""
await self.delete_crawl_pages(crawl_id, oid)
@ -738,13 +877,14 @@ class PageOps:
# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
def init_pages_api(
- app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, user_dep
+ app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep
):
"""init pages API"""
# pylint: disable=invalid-name
- ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops)
+ ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops)
org_viewer_dep = org_ops.org_viewer_dep
org_crawl_dep = org_ops.org_crawl_dep
@app.post(
@ -913,9 +1053,14 @@ def init_pages_api(
tags=["pages", "all-crawls"],
response_model=PaginatedPageOutResponse,
)
- async def get_pages_list(
+ async def get_crawl_pages_list(
crawl_id: str,
org: Organization = Depends(org_crawl_dep),
+ url: Optional[str] = None,
+ urlPrefix: Optional[str] = None,
+ ts: Optional[datetime] = None,
+ isSeed: Optional[bool] = None,
+ depth: Optional[int] = None,
reviewed: Optional[bool] = None,
approved: Optional[str] = None,
hasNotes: Optional[bool] = None,
@ -932,6 +1077,11 @@ def init_pages_api(
pages, total = await ops.list_pages(
crawl_id=crawl_id,
org=org,
+ url=url,
+ url_prefix=urlPrefix,
+ ts=ts,
+ is_seed=isSeed,
+ depth=depth,
reviewed=reviewed,
approved=formatted_approved,
has_notes=hasNotes,
@ -942,6 +1092,40 @@ def init_pages_api(
)
return paginated_format(pages, total, page, pageSize)
@app.get(
"/orgs/{oid}/collections/{coll_id}/pages",
tags=["pages", "collections"],
response_model=PaginatedPageOutResponse,
)
async def get_collection_pages_list(
coll_id: UUID,
org: Organization = Depends(org_viewer_dep),
url: Optional[str] = None,
urlPrefix: Optional[str] = None,
ts: Optional[datetime] = None,
isSeed: Optional[bool] = None,
depth: Optional[int] = None,
pageSize: int = DEFAULT_PAGE_SIZE,
page: int = 1,
sortBy: Optional[str] = None,
sortDirection: Optional[int] = -1,
):
"""Retrieve paginated list of pages in collection"""
pages, total = await ops.list_collection_pages(
coll_id=coll_id,
org=org,
url=url,
url_prefix=urlPrefix,
ts=ts,
is_seed=isSeed,
depth=depth,
page_size=pageSize,
page=page,
sort_by=sortBy,
sort_direction=sortDirection,
)
return paginated_format(pages, total, page, pageSize)
@app.get(
"/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/pages",
tags=["pages", "qa"],

View File

@ -232,7 +232,7 @@ def crawler_crawl_id(crawler_auth_headers, default_org_id):
"name": "Crawler User Test Crawl",
"description": "crawler test crawl",
"tags": ["wr-test-2"],
"config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 1},
"config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 3},
"crawlerChannel": "test",
}
r = requests.post(

View File

@ -582,6 +582,121 @@ def test_list_collections(
assert second_coll["dateLatest"]
def test_list_pages_in_collection(crawler_auth_headers, default_org_id):
# Test list endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 0
pages = data["items"]
assert pages
for page in pages:
assert page["id"]
assert page["oid"]
assert page["crawl_id"]
assert page["url"]
assert page["ts"]
assert page.get("title") or page.get("title") is None
assert page.get("loadState") or page.get("loadState") is None
assert page.get("status") or page.get("status") is None
assert page.get("mime") or page.get("mime") is None
assert page["isError"] in (None, True, False)
assert page["isFile"] in (None, True, False)
# Save info for page to test url and urlPrefix filters
coll_page = pages[0]
coll_page_id = coll_page["id"]
coll_page_url = coll_page["url"]
coll_page_ts = coll_page["ts"]
# Test exact url filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?url={coll_page_url}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
for matching_page in data["items"]:
assert matching_page["url"] == coll_page_url
# Test exact url and ts filters together
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?url={coll_page_url}&ts={coll_page_ts}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
for matching_page in data["items"]:
assert matching_page["url"] == coll_page_url
assert matching_page["ts"] == coll_page_ts
# Test urlPrefix filter
url_prefix = coll_page_url[:8]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?urlPrefix={url_prefix}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
found_matching_page = False
for page in data["items"]:
if page["id"] == coll_page_id and page["url"] == coll_page_url:
found_matching_page = True
assert found_matching_page
# Test isSeed filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?isSeed=true",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
for page in data["items"]:
assert page["isSeed"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?isSeed=false",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
for page in data["items"]:
assert page["isSeed"] is False
# Test depth filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?depth=0",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
for page in data["items"]:
assert page["depth"] == 0
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?depth=1",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
for page in data["items"]:
assert page["depth"] == 1
def test_remove_upload_from_collection(crawler_auth_headers, default_org_id):
# Remove upload
r = requests.post(

View File

@ -658,7 +658,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 0
assert data["total"] == 3
pages = data["items"]
assert pages
@ -682,7 +683,11 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
# Test GET page endpoint
global page_id
page_id = pages[0]["id"]
test_page = pages[0]
page_id = test_page["id"]
test_page_url = test_page["url"]
test_page_ts = test_page["ts"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
headers=crawler_auth_headers,
@ -710,13 +715,100 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
assert page.get("modified") is None
assert page.get("approved") is None
# Test exact url filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?url={test_page_url}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
for matching_page in data["items"]:
assert matching_page["url"] == test_page_url
# Test exact url and ts filters together
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?url={test_page_url}&ts={test_page_ts}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
for matching_page in data["items"]:
assert matching_page["url"] == test_page_url
assert matching_page["ts"] == test_page_ts
# Test urlPrefix filter
url_prefix = test_page_url[:8]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?urlPrefix={url_prefix}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
found_matching_page = False
for page in data["items"]:
if page["id"] == page_id and page["url"] == test_page_url:
found_matching_page = True
assert found_matching_page
# Test isSeed filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?isSeed=True",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 1
for page in data["items"]:
assert page["isSeed"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?isSeed=False",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 2
for page in data["items"]:
assert page["isSeed"] is False
# Test depth filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?depth=0",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 1
for page in data["items"]:
assert page["depth"] == 0
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?depth=1",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 2
for page in data["items"]:
assert page["depth"] == 1
def test_crawl_pages_qa_filters(crawler_auth_headers, default_org_id, crawler_crawl_id):
# Test reviewed filter (page has no notes or approved so should show up in false)
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=False",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 1
assert r.json()["total"] == 3
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True",
@ -770,15 +862,15 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 0
assert r.json()["total"] == 2
# Test reviewed filter (page now approved so should show up in True)
# Test reviewed filter (page now approved so should show up in True, other pages show here)
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=False",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 0
assert r.json()["total"] == 2
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True",
@ -853,7 +945,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 0
assert r.json()["total"] == 2
def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
@ -985,14 +1077,14 @@ def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 1
assert r.json()["total"] == 3
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=true,false,none",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 1
assert r.json()["total"] == 3
# Test reviewed filter (page now has notes so should show up in True)
r = requests.get(
@ -1000,7 +1092,7 @@ def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 0
assert r.json()["total"] == 2
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True",
@ -1015,7 +1107,7 @@ def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 0
assert r.json()["total"] == 2
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?hasNotes=True",