From 98a45b0d85e0653f4cc2913a4640569c6fe9c172 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 10 Feb 2025 19:44:37 -0500 Subject: [PATCH] Add collection page list/search endpoint (#2354) Fixes #2353 Adds a new endpoint to list pages in a collection, with filtering available on `url` (exact match), `ts`, `urlPrefix`, `isSeed`, and `depth`, as well as accompanying tests. Additional sort options have been added as well. These same filters and sort options have also been added to the crawl pages endpoint. Also fixes an issue where `isSeed` wasn't being set in the database when false but only added on serialization, which was preventing filtering from working as expected. --- backend/btrixcloud/main.py | 9 +- backend/btrixcloud/ops.py | 4 +- backend/btrixcloud/pages.py | 202 +++++++++++++++++++++++++++++-- backend/test/conftest.py | 2 +- backend/test/test_collections.py | 115 ++++++++++++++++++ backend/test/test_run_crawl.py | 114 +++++++++++++++-- 6 files changed, 423 insertions(+), 23 deletions(-) diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index 507db08f..618cfd1d 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -248,7 +248,14 @@ def main() -> None: upload_ops = init_uploads_api(*base_crawl_init) page_ops = init_pages_api( - app, mdb, crawls, org_ops, storage_ops, background_job_ops, current_active_user + app, + mdb, + crawls, + org_ops, + storage_ops, + background_job_ops, + coll_ops, + current_active_user, ) base_crawl_ops.set_page_ops(page_ops) diff --git a/backend/btrixcloud/ops.py b/backend/btrixcloud/ops.py index 7c0bf294..5a67acd0 100644 --- a/backend/btrixcloud/ops.py +++ b/backend/btrixcloud/ops.py @@ -89,7 +89,9 @@ def init_ops() -> Tuple[ upload_ops = UploadOps(*base_crawl_init) - page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops) + page_ops = PageOps( + mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops + ) base_crawl_ops.set_page_ops(page_ops) crawl_ops.set_page_ops(page_ops) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 4b53b5b9..ebabd297 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -1,8 +1,12 @@ """crawl pages""" +# pylint: disable=too-many-lines + import asyncio import os +import re import traceback +import urllib.parse from datetime import datetime from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union from uuid import UUID, uuid4 @@ -37,11 +41,12 @@ from .utils import str_to_date, str_list_to_bools, dt_now if TYPE_CHECKING: from .background_jobs import BackgroundJobOps + from .colls import CollectionOps from .crawls import CrawlOps from .orgs import OrgOps from .storages import StorageOps else: - CrawlOps = StorageOps = OrgOps = BackgroundJobOps = object + CrawlOps = StorageOps = OrgOps = BackgroundJobOps = CollectionOps = object # ============================================================================ @@ -53,14 +58,18 @@ class PageOps: org_ops: OrgOps storage_ops: StorageOps background_job_ops: BackgroundJobOps + coll_ops: CollectionOps - def __init__(self, mdb, crawl_ops, org_ops, storage_ops, background_job_ops): + def __init__( + self, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops + ): self.pages = mdb["pages"] self.crawls = mdb["crawls"] self.crawl_ops = crawl_ops self.org_ops = org_ops self.storage_ops = storage_ops self.background_job_ops = background_job_ops + self.coll_ops = coll_ops async def init_index(self): """init index for pages db collection""" @@ -82,6 
+91,9 @@ class PageOps: if not page_dict.get("url"): continue + if not page_dict.get("isSeed"): + page_dict["isSeed"] = False + if len(pages_buffer) > batch_size: await self._add_pages_to_db(crawl_id, pages_buffer) pages_buffer = [] @@ -210,9 +222,8 @@ class PageOps: ): """Add page to database""" page = self._get_page_from_dict(page_dict, crawl_id, oid) - page_to_insert = page.to_dict( - exclude_unset=True, exclude_none=True, exclude_defaults=True - ) + + page_to_insert = page.to_dict(exclude_unset=True, exclude_none=True) try: await self.pages.insert_one(page_to_insert) @@ -492,6 +503,11 @@ class PageOps: self, crawl_id: str, org: Optional[Organization] = None, + url: Optional[str] = None, + url_prefix: Optional[str] = None, + ts: Optional[datetime] = None, + is_seed: Optional[bool] = None, + depth: Optional[int] = None, qa_run_id: Optional[str] = None, qa_filter_by: Optional[str] = None, qa_gte: Optional[float] = None, @@ -518,6 +534,23 @@ class PageOps: if org: query["oid"] = org.id + if url_prefix: + url_prefix = urllib.parse.unquote(url_prefix) + regex_pattern = f"^{re.escape(url_prefix)}" + query["url"] = {"$regex": regex_pattern, "$options": "i"} + + elif url: + query["url"] = urllib.parse.unquote(url) + + if ts: + query["ts"] = ts + + if is_seed in (True, False): + query["isSeed"] = is_seed + + if isinstance(depth, int): + query["depth"] = depth + if reviewed: query["$or"] = [ {"approved": {"$ne": None}}, @@ -562,7 +595,18 @@ class PageOps: # Sorting options to add: # - automated heuristics like screenshot_comparison (dict keyed by QA run id) # - Ensure notes sorting works okay with notes in list - sort_fields = ("url", "title", "notes", "approved") + sort_fields = ( + "url", + "title", + "notes", + "approved", + "ts", + "status", + "mime", + "filename", + "depth", + "isSeed", + ) qa_sort_fields = ("screenshotMatch", "textMatch") if sort_by not in sort_fields and sort_by not in qa_sort_fields: raise HTTPException(status_code=400, detail="invalid_sort_by") @@ -613,6 +657,101 @@ class PageOps: return [PageOut.from_dict(data) for data in items], total + async def list_collection_pages( + self, + coll_id: UUID, + org: Optional[Organization] = None, + url: Optional[str] = None, + url_prefix: Optional[str] = None, + ts: Optional[datetime] = None, + is_seed: Optional[bool] = None, + depth: Optional[int] = None, + page_size: int = DEFAULT_PAGE_SIZE, + page: int = 1, + sort_by: Optional[str] = None, + sort_direction: Optional[int] = -1, + ) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]: + """List all pages in collection, with optional filtering""" + # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements + # Zero-index page for query + page = page - 1 + skip = page_size * page + + crawl_ids = await self.coll_ops.get_collection_crawl_ids(coll_id) + + query: dict[str, object] = { + "crawl_id": {"$in": crawl_ids}, + } + if org: + query["oid"] = org.id + + if url_prefix: + url_prefix = urllib.parse.unquote(url_prefix) + regex_pattern = f"^{re.escape(url_prefix)}" + query["url"] = {"$regex": regex_pattern, "$options": "i"} + + elif url: + query["url"] = urllib.parse.unquote(url) + + if ts: + query["ts"] = ts + + if is_seed in (True, False): + query["isSeed"] = is_seed + + if isinstance(depth, int): + query["depth"] = depth + + aggregate = [{"$match": query}] + + if sort_by: + # Sorting options to add: + # - automated heuristics like screenshot_comparison (dict keyed by QA run id) + # - Ensure notes sorting works okay with notes in list + 
sort_fields = ( + "url", + "crawl_id", + "ts", + "status", + "mime", + "filename", + "depth", + "isSeed", + ) + if sort_by not in sort_fields: + raise HTTPException(status_code=400, detail="invalid_sort_by") + if sort_direction not in (1, -1): + raise HTTPException(status_code=400, detail="invalid_sort_direction") + + aggregate.extend([{"$sort": {sort_by: sort_direction}}]) + + aggregate.extend( + [ + { + "$facet": { + "items": [ + {"$skip": skip}, + {"$limit": page_size}, + ], + "total": [{"$count": "count"}], + } + }, + ] + ) + + # Get total + cursor = self.pages.aggregate(aggregate) + results = await cursor.to_list(length=1) + result = results[0] + items = result["items"] + + try: + total = int(result["total"][0]["count"]) + except (IndexError, ValueError): + total = 0 + + return [PageOut.from_dict(data) for data in items], total + async def re_add_crawl_pages(self, crawl_id: str, oid: UUID): """Delete existing pages for crawl and re-add from WACZs.""" await self.delete_crawl_pages(crawl_id, oid) @@ -738,13 +877,14 @@ class PageOps: # ============================================================================ # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme def init_pages_api( - app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, user_dep + app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep ): """init pages API""" # pylint: disable=invalid-name - ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops) + ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops) + org_viewer_dep = org_ops.org_viewer_dep org_crawl_dep = org_ops.org_crawl_dep @app.post( @@ -913,9 +1053,14 @@ def init_pages_api( tags=["pages", "all-crawls"], response_model=PaginatedPageOutResponse, ) - async def get_pages_list( + async def get_crawl_pages_list( crawl_id: str, org: Organization = Depends(org_crawl_dep), + url: Optional[str] = None, + urlPrefix: Optional[str] = None, + ts: Optional[datetime] = None, + isSeed: Optional[bool] = None, + depth: Optional[int] = None, reviewed: Optional[bool] = None, approved: Optional[str] = None, hasNotes: Optional[bool] = None, @@ -932,6 +1077,11 @@ def init_pages_api( pages, total = await ops.list_pages( crawl_id=crawl_id, org=org, + url=url, + url_prefix=urlPrefix, + ts=ts, + is_seed=isSeed, + depth=depth, reviewed=reviewed, approved=formatted_approved, has_notes=hasNotes, @@ -942,6 +1092,40 @@ def init_pages_api( ) return paginated_format(pages, total, page, pageSize) + @app.get( + "/orgs/{oid}/collections/{coll_id}/pages", + tags=["pages", "collections"], + response_model=PaginatedPageOutResponse, + ) + async def get_collection_pages_list( + coll_id: UUID, + org: Organization = Depends(org_viewer_dep), + url: Optional[str] = None, + urlPrefix: Optional[str] = None, + ts: Optional[datetime] = None, + isSeed: Optional[bool] = None, + depth: Optional[int] = None, + pageSize: int = DEFAULT_PAGE_SIZE, + page: int = 1, + sortBy: Optional[str] = None, + sortDirection: Optional[int] = -1, + ): + """Retrieve paginated list of pages in collection""" + pages, total = await ops.list_collection_pages( + coll_id=coll_id, + org=org, + url=url, + url_prefix=urlPrefix, + ts=ts, + is_seed=isSeed, + depth=depth, + page_size=pageSize, + page=page, + sort_by=sortBy, + sort_direction=sortDirection, + ) + return paginated_format(pages, total, page, pageSize) + @app.get( "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/pages", tags=["pages", "qa"], diff --git a/backend/test/conftest.py 
b/backend/test/conftest.py index d2f41923..b9a6e717 100644 --- a/backend/test/conftest.py +++ b/backend/test/conftest.py @@ -232,7 +232,7 @@ def crawler_crawl_id(crawler_auth_headers, default_org_id): "name": "Crawler User Test Crawl", "description": "crawler test crawl", "tags": ["wr-test-2"], - "config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 1}, + "config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 3}, "crawlerChannel": "test", } r = requests.post( diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index e219134b..73fd1cba 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -582,6 +582,121 @@ def test_list_collections( assert second_coll["dateLatest"] +def test_list_pages_in_collection(crawler_auth_headers, default_org_id): + # Test list endpoint + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["total"] >= 0 + + pages = data["items"] + assert pages + + for page in pages: + assert page["id"] + assert page["oid"] + assert page["crawl_id"] + assert page["url"] + assert page["ts"] + assert page.get("title") or page.get("title") is None + assert page.get("loadState") or page.get("loadState") is None + assert page.get("status") or page.get("status") is None + assert page.get("mime") or page.get("mime") is None + assert page["isError"] in (None, True, False) + assert page["isFile"] in (None, True, False) + + # Save info for page to test url and urlPrefix filters + coll_page = pages[0] + coll_page_id = coll_page["id"] + coll_page_url = coll_page["url"] + coll_page_ts = coll_page["ts"] + + # Test exact url filter + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?url={coll_page_url}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["total"] >= 1 + for matching_page in data["items"]: + assert matching_page["url"] == coll_page_url + + # Test exact url and ts filters together + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?url={coll_page_url}&ts={coll_page_ts}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["total"] >= 1 + for matching_page in data["items"]: + assert matching_page["url"] == coll_page_url + assert matching_page["ts"] == coll_page_ts + + # Test urlPrefix filter + url_prefix = coll_page_url[:8] + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?urlPrefix={url_prefix}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["total"] >= 1 + + found_matching_page = False + for page in data["items"]: + if page["id"] == coll_page_id and page["url"] == coll_page_url: + found_matching_page = True + + assert found_matching_page + + # Test isSeed filter + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?isSeed=true", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + for page in data["items"]: + assert page["isSeed"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?isSeed=false", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + for page in data["items"]: + assert page["isSeed"] is False + + # Test depth filter + r = requests.get( + 
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?depth=0", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + for page in data["items"]: + assert page["depth"] == 0 + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?depth=1", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + for page in data["items"]: + assert page["depth"] == 1 + + def test_remove_upload_from_collection(crawler_auth_headers, default_org_id): # Remove upload r = requests.post( diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index 511c4c6c..51cec3cb 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -658,7 +658,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id): ) assert r.status_code == 200 data = r.json() - assert data["total"] >= 0 + + assert data["total"] == 3 pages = data["items"] assert pages @@ -682,7 +683,11 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id): # Test GET page endpoint global page_id - page_id = pages[0]["id"] + test_page = pages[0] + page_id = test_page["id"] + test_page_url = test_page["url"] + test_page_ts = test_page["ts"] + r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}", headers=crawler_auth_headers, @@ -710,13 +715,100 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id): assert page.get("modified") is None assert page.get("approved") is None + # Test exact url filter + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?url={test_page_url}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["total"] >= 1 + for matching_page in data["items"]: + assert matching_page["url"] == test_page_url + + # Test exact url and ts filters together + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?url={test_page_url}&ts={test_page_ts}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["total"] >= 1 + for matching_page in data["items"]: + assert matching_page["url"] == test_page_url + assert matching_page["ts"] == test_page_ts + + # Test urlPrefix filter + url_prefix = test_page_url[:8] + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?urlPrefix={url_prefix}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["total"] >= 1 + + found_matching_page = False + for page in data["items"]: + if page["id"] == page_id and page["url"] == test_page_url: + found_matching_page = True + + assert found_matching_page + + # Test isSeed filter + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?isSeed=True", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 1 + for page in data["items"]: + assert page["isSeed"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?isSeed=False", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 2 + for page in data["items"]: + assert page["isSeed"] is False + + # Test depth filter + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?depth=0", + headers=crawler_auth_headers, + ) + 
assert r.status_code == 200 + data = r.json() + assert data["total"] == 1 + for page in data["items"]: + assert page["depth"] == 0 + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?depth=1", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 2 + for page in data["items"]: + assert page["depth"] == 1 + + +def test_crawl_pages_qa_filters(crawler_auth_headers, default_org_id, crawler_crawl_id): # Test reviewed filter (page has no notes or approved so should show up in false) r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=False", headers=crawler_auth_headers, ) assert r.status_code == 200 - assert r.json()["total"] == 1 + assert r.json()["total"] == 3 r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True", @@ -770,15 +862,15 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id): headers=crawler_auth_headers, ) assert r.status_code == 200 - assert r.json()["total"] == 0 + assert r.json()["total"] == 2 - # Test reviewed filter (page now approved so should show up in True) + # Test reviewed filter (page now approved so should show up in True, other pages show here) r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=False", headers=crawler_auth_headers, ) assert r.status_code == 200 - assert r.json()["total"] == 0 + assert r.json()["total"] == 2 r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True", @@ -853,7 +945,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id): headers=crawler_auth_headers, ) assert r.status_code == 200 - assert r.json()["total"] == 0 + assert r.json()["total"] == 2 def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id): @@ -985,14 +1077,14 @@ def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id headers=crawler_auth_headers, ) assert r.status_code == 200 - assert r.json()["total"] == 1 + assert r.json()["total"] == 3 r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=true,false,none", headers=crawler_auth_headers, ) assert r.status_code == 200 - assert r.json()["total"] == 1 + assert r.json()["total"] == 3 # Test reviewed filter (page now has notes so should show up in True) r = requests.get( @@ -1000,7 +1092,7 @@ def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id headers=crawler_auth_headers, ) assert r.status_code == 200 - assert r.json()["total"] == 0 + assert r.json()["total"] == 2 r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True", @@ -1015,7 +1107,7 @@ def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id headers=crawler_auth_headers, ) assert r.status_code == 200 - assert r.json()["total"] == 0 + assert r.json()["total"] == 2 r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?hasNotes=True",
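
For anyone exercising the new collection pages endpoint by hand, a request might look like the sketch below. It mirrors the style of the tests above; API_PREFIX, the auth token, and the org/collection ids are placeholders for a real deployment and are not defined anywhere in this patch.

import requests

API_PREFIX = "http://localhost:30870/api"  # placeholder; point at a real deployment
auth_headers = {"Authorization": "Bearer <token>"}  # placeholder token

org_id = "<org-uuid>"        # placeholder
coll_id = "<collection-uuid>"  # placeholder

# List pages in the collection, filtered to seed pages at depth 0 whose URL
# starts with the given prefix, sorted by timestamp descending.
r = requests.get(
    f"{API_PREFIX}/orgs/{org_id}/collections/{coll_id}/pages",
    headers=auth_headers,
    params={
        "urlPrefix": "https://webrecorder.net/",
        "isSeed": "true",
        "depth": 0,
        "sortBy": "ts",
        "sortDirection": -1,
        "page": 1,
        "pageSize": 25,
    },
)
r.raise_for_status()
data = r.json()
print(data["total"])
for page in data["items"]:
    print(page["url"], page["ts"], page["isSeed"], page["depth"])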
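
The url/urlPrefix/ts/isSeed/depth filters translate into a MongoDB query the same way in both list_pages and list_collection_pages. The helper below is a condensed sketch of that shared translation, not a function that exists in the patch; it only restates the logic of the duplicated filter blocks.

import re
import urllib.parse
from datetime import datetime
from typing import Optional


def build_page_filter_query(
    url: Optional[str] = None,
    url_prefix: Optional[str] = None,
    ts: Optional[datetime] = None,
    is_seed: Optional[bool] = None,
    depth: Optional[int] = None,
) -> dict:
    """Sketch of the filter logic shared by the crawl and collection page listings."""
    query: dict = {}

    if url_prefix:
        # Case-insensitive "starts with" match; re.escape keeps regex
        # metacharacters in the URL from being interpreted.
        prefix = urllib.parse.unquote(url_prefix)
        query["url"] = {"$regex": f"^{re.escape(prefix)}", "$options": "i"}
    elif url:
        # Exact match on the decoded URL; urlPrefix takes precedence if both are given.
        query["url"] = urllib.parse.unquote(url)

    if ts:
        query["ts"] = ts

    if is_seed in (True, False):
        # Explicit membership check so isSeed=False is still applied as a filter.
        query["isSeed"] = is_seed

    if isinstance(depth, int):
        # Same idea: depth=0 is a valid filter value.
        query["depth"] = depth

    return query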
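
Both listing methods paginate with a single aggregation: a $match, an optional $sort, and a $facet stage that returns one page of items alongside the total count in the same round trip. A rough standalone equivalent using the synchronous pymongo driver (the backend itself uses an async driver) is sketched below; the connection string and database name are placeholders, and only the "pages" collection name comes from the patch.

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # placeholder connection string
pages = client["btrix"]["pages"]  # placeholder database name


def list_pages_paginated(query: dict, sort_by: str, sort_direction: int,
                         page: int = 1, page_size: int = 25):
    """One-round-trip pagination via $facet, mirroring list_collection_pages."""
    skip = (page - 1) * page_size

    pipeline = [
        {"$match": query},
        {"$sort": {sort_by: sort_direction}},
        {
            "$facet": {
                "items": [{"$skip": skip}, {"$limit": page_size}],
                "total": [{"$count": "count"}],
            }
        },
    ]

    # $facet always yields exactly one result document.
    result = next(pages.aggregate(pipeline))
    items = result["items"]
    try:
        total = int(result["total"][0]["count"])
    except (IndexError, ValueError):
        # No matching documents: the $count facet comes back as an empty list.
        total = 0
    return items, total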