diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py
index 507db08f..618cfd1d 100644
--- a/backend/btrixcloud/main.py
+++ b/backend/btrixcloud/main.py
@@ -248,7 +248,14 @@ def main() -> None:
     upload_ops = init_uploads_api(*base_crawl_init)
 
     page_ops = init_pages_api(
-        app, mdb, crawls, org_ops, storage_ops, background_job_ops, current_active_user
+        app,
+        mdb,
+        crawls,
+        org_ops,
+        storage_ops,
+        background_job_ops,
+        coll_ops,
+        current_active_user,
     )
 
     base_crawl_ops.set_page_ops(page_ops)
diff --git a/backend/btrixcloud/ops.py b/backend/btrixcloud/ops.py
index 7c0bf294..5a67acd0 100644
--- a/backend/btrixcloud/ops.py
+++ b/backend/btrixcloud/ops.py
@@ -89,7 +89,9 @@ def init_ops() -> Tuple[
 
     upload_ops = UploadOps(*base_crawl_init)
 
-    page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops)
+    page_ops = PageOps(
+        mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops
+    )
 
     base_crawl_ops.set_page_ops(page_ops)
     crawl_ops.set_page_ops(page_ops)
diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index 4b53b5b9..ebabd297 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -1,8 +1,12 @@
 """crawl pages"""
 
+# pylint: disable=too-many-lines
+
 import asyncio
 import os
+import re
 import traceback
+import urllib.parse
 from datetime import datetime
 from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
 from uuid import UUID, uuid4
@@ -37,11 +41,12 @@ from .utils import str_to_date, str_list_to_bools, dt_now
 
 if TYPE_CHECKING:
     from .background_jobs import BackgroundJobOps
+    from .colls import CollectionOps
     from .crawls import CrawlOps
     from .orgs import OrgOps
     from .storages import StorageOps
 else:
-    CrawlOps = StorageOps = OrgOps = BackgroundJobOps = object
+    CrawlOps = StorageOps = OrgOps = BackgroundJobOps = CollectionOps = object
 
 
 # ============================================================================
@@ -53,14 +58,18 @@ class PageOps:
     org_ops: OrgOps
     storage_ops: StorageOps
     background_job_ops: BackgroundJobOps
+    coll_ops: CollectionOps
 
-    def __init__(self, mdb, crawl_ops, org_ops, storage_ops, background_job_ops):
+    def __init__(
+        self, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops
+    ):
         self.pages = mdb["pages"]
         self.crawls = mdb["crawls"]
         self.crawl_ops = crawl_ops
         self.org_ops = org_ops
         self.storage_ops = storage_ops
         self.background_job_ops = background_job_ops
+        self.coll_ops = coll_ops
 
     async def init_index(self):
         """init index for pages db collection"""
@@ -82,6 +91,9 @@ class PageOps:
             if not page_dict.get("url"):
                 continue
 
+            if not page_dict.get("isSeed"):
+                page_dict["isSeed"] = False
+
             if len(pages_buffer) > batch_size:
                 await self._add_pages_to_db(crawl_id, pages_buffer)
                 pages_buffer = []
@@ -210,9 +222,8 @@ class PageOps:
     ):
         """Add page to database"""
         page = self._get_page_from_dict(page_dict, crawl_id, oid)
-        page_to_insert = page.to_dict(
-            exclude_unset=True, exclude_none=True, exclude_defaults=True
-        )
+
+        page_to_insert = page.to_dict(exclude_unset=True, exclude_none=True)
 
         try:
             await self.pages.insert_one(page_to_insert)
@@ -492,6 +503,11 @@ class PageOps:
         self,
         crawl_id: str,
         org: Optional[Organization] = None,
+        url: Optional[str] = None,
+        url_prefix: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        is_seed: Optional[bool] = None,
+        depth: Optional[int] = None,
         qa_run_id: Optional[str] = None,
         qa_filter_by: Optional[str] = None,
         qa_gte: Optional[float] = None,
@@ -518,6 +534,23 @@ class PageOps:
         if org:
             query["oid"] = org.id
 
+        if url_prefix:
+            url_prefix = urllib.parse.unquote(url_prefix)
+            regex_pattern = f"^{re.escape(url_prefix)}"
+            query["url"] = {"$regex": regex_pattern, "$options": "i"}
+
+        elif url:
+            query["url"] = urllib.parse.unquote(url)
+
+        if ts:
+            query["ts"] = ts
+
+        if is_seed in (True, False):
+            query["isSeed"] = is_seed
+
+        if isinstance(depth, int):
+            query["depth"] = depth
+
         if reviewed:
             query["$or"] = [
                 {"approved": {"$ne": None}},
@@ -562,7 +595,18 @@ class PageOps:
             # Sorting options to add:
            # - automated heuristics like screenshot_comparison (dict keyed by QA run id)
             # - Ensure notes sorting works okay with notes in list
-            sort_fields = ("url", "title", "notes", "approved")
+            sort_fields = (
+                "url",
+                "title",
+                "notes",
+                "approved",
+                "ts",
+                "status",
+                "mime",
+                "filename",
+                "depth",
+                "isSeed",
+            )
             qa_sort_fields = ("screenshotMatch", "textMatch")
             if sort_by not in sort_fields and sort_by not in qa_sort_fields:
                 raise HTTPException(status_code=400, detail="invalid_sort_by")
@@ -613,6 +657,101 @@ class PageOps:
 
         return [PageOut.from_dict(data) for data in items], total
 
+    async def list_collection_pages(
+        self,
+        coll_id: UUID,
+        org: Optional[Organization] = None,
+        url: Optional[str] = None,
+        url_prefix: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        is_seed: Optional[bool] = None,
+        depth: Optional[int] = None,
+        page_size: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+        sort_by: Optional[str] = None,
+        sort_direction: Optional[int] = -1,
+    ) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]:
+        """List all pages in collection, with optional filtering"""
+        # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
+        # Zero-index page for query
+        page = page - 1
+        skip = page_size * page
+
+        crawl_ids = await self.coll_ops.get_collection_crawl_ids(coll_id)
+
+        query: dict[str, object] = {
+            "crawl_id": {"$in": crawl_ids},
+        }
+        if org:
+            query["oid"] = org.id
+
+        if url_prefix:
+            url_prefix = urllib.parse.unquote(url_prefix)
+            regex_pattern = f"^{re.escape(url_prefix)}"
+            query["url"] = {"$regex": regex_pattern, "$options": "i"}
+
+        elif url:
+            query["url"] = urllib.parse.unquote(url)
+
+        if ts:
+            query["ts"] = ts
+
+        if is_seed in (True, False):
+            query["isSeed"] = is_seed
+
+        if isinstance(depth, int):
+            query["depth"] = depth
+
+        aggregate = [{"$match": query}]
+
+        if sort_by:
+            # Sorting options to add:
+            # - automated heuristics like screenshot_comparison (dict keyed by QA run id)
+            # - Ensure notes sorting works okay with notes in list
+            sort_fields = (
+                "url",
+                "crawl_id",
+                "ts",
+                "status",
+                "mime",
+                "filename",
+                "depth",
+                "isSeed",
+            )
+            if sort_by not in sort_fields:
+                raise HTTPException(status_code=400, detail="invalid_sort_by")
+            if sort_direction not in (1, -1):
+                raise HTTPException(status_code=400, detail="invalid_sort_direction")
+
+            aggregate.extend([{"$sort": {sort_by: sort_direction}}])
+
+        aggregate.extend(
+            [
+                {
+                    "$facet": {
+                        "items": [
+                            {"$skip": skip},
+                            {"$limit": page_size},
+                        ],
+                        "total": [{"$count": "count"}],
+                    }
+                },
+            ]
+        )
+
+        # Get total
+        cursor = self.pages.aggregate(aggregate)
+        results = await cursor.to_list(length=1)
+        result = results[0]
+        items = result["items"]
+
+        try:
+            total = int(result["total"][0]["count"])
+        except (IndexError, ValueError):
+            total = 0
+
+        return [PageOut.from_dict(data) for data in items], total
+
     async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
         """Delete existing pages for crawl and re-add from WACZs."""
         await self.delete_crawl_pages(crawl_id, oid)
@@ -738,13 +877,14 @@ class PageOps:
 
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
 def init_pages_api(
-    app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, user_dep
+    app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep
 ):
     """init pages API"""
     # pylint: disable=invalid-name
 
-    ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops)
+    ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops)
+
+    org_viewer_dep = org_ops.org_viewer_dep
     org_crawl_dep = org_ops.org_crawl_dep
 
     @app.post(
@@ -913,9 +1053,14 @@ def init_pages_api(
         tags=["pages", "all-crawls"],
         response_model=PaginatedPageOutResponse,
     )
-    async def get_pages_list(
+    async def get_crawl_pages_list(
         crawl_id: str,
         org: Organization = Depends(org_crawl_dep),
+        url: Optional[str] = None,
+        urlPrefix: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        isSeed: Optional[bool] = None,
+        depth: Optional[int] = None,
         reviewed: Optional[bool] = None,
         approved: Optional[str] = None,
         hasNotes: Optional[bool] = None,
@@ -932,6 +1077,11 @@ def init_pages_api(
         pages, total = await ops.list_pages(
             crawl_id=crawl_id,
             org=org,
+            url=url,
+            url_prefix=urlPrefix,
+            ts=ts,
+            is_seed=isSeed,
+            depth=depth,
             reviewed=reviewed,
             approved=formatted_approved,
             has_notes=hasNotes,
@@ -942,6 +1092,40 @@ def init_pages_api(
         )
         return paginated_format(pages, total, page, pageSize)
 
+    @app.get(
+        "/orgs/{oid}/collections/{coll_id}/pages",
+        tags=["pages", "collections"],
+        response_model=PaginatedPageOutResponse,
+    )
+    async def get_collection_pages_list(
+        coll_id: UUID,
+        org: Organization = Depends(org_viewer_dep),
+        url: Optional[str] = None,
+        urlPrefix: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        isSeed: Optional[bool] = None,
+        depth: Optional[int] = None,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+        sortBy: Optional[str] = None,
+        sortDirection: Optional[int] = -1,
+    ):
+        """Retrieve paginated list of pages in collection"""
+        pages, total = await ops.list_collection_pages(
+            coll_id=coll_id,
+            org=org,
+            url=url,
+            url_prefix=urlPrefix,
+            ts=ts,
+            is_seed=isSeed,
+            depth=depth,
+            page_size=pageSize,
+            page=page,
+            sort_by=sortBy,
+            sort_direction=sortDirection,
+        )
+        return paginated_format(pages, total, page, pageSize)
+
     @app.get(
         "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/pages",
         tags=["pages", "qa"],
diff --git a/backend/test/conftest.py b/backend/test/conftest.py
index d2f41923..b9a6e717 100644
--- a/backend/test/conftest.py
+++ b/backend/test/conftest.py
@@ -232,7 +232,7 @@ def crawler_crawl_id(crawler_auth_headers, default_org_id):
         "name": "Crawler User Test Crawl",
         "description": "crawler test crawl",
         "tags": ["wr-test-2"],
-        "config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 1},
+        "config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 3},
         "crawlerChannel": "test",
     }
     r = requests.post(
diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py
index e219134b..73fd1cba 100644
--- a/backend/test/test_collections.py
+++ b/backend/test/test_collections.py
@@ -582,6 +582,121 @@ def test_list_collections(
     assert second_coll["dateLatest"]
 
 
+def test_list_pages_in_collection(crawler_auth_headers, default_org_id):
+    # Test list endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 0
+
+    pages = data["items"]
+    assert pages
+
+    for page in pages:
+        assert page["id"]
+        assert page["oid"]
+        assert page["crawl_id"]
+        assert page["url"]
+        assert page["ts"]
+        assert page.get("title") or page.get("title") is None
+        assert page.get("loadState") or page.get("loadState") is None
+        assert page.get("status") or page.get("status") is None
+        assert page.get("mime") or page.get("mime") is None
+        assert page["isError"] in (None, True, False)
+        assert page["isFile"] in (None, True, False)
+
+    # Save info for page to test url and urlPrefix filters
+    coll_page = pages[0]
+    coll_page_id = coll_page["id"]
+    coll_page_url = coll_page["url"]
+    coll_page_ts = coll_page["ts"]
+
+    # Test exact url filter
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?url={coll_page_url}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert matching_page["url"] == coll_page_url
+
+    # Test exact url and ts filters together
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?url={coll_page_url}&ts={coll_page_ts}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert matching_page["url"] == coll_page_url
+        assert matching_page["ts"] == coll_page_ts
+
+    # Test urlPrefix filter
+    url_prefix = coll_page_url[:8]
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?urlPrefix={url_prefix}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+
+    found_matching_page = False
+    for page in data["items"]:
+        if page["id"] == coll_page_id and page["url"] == coll_page_url:
+            found_matching_page = True
+
+    assert found_matching_page
+
+    # Test isSeed filter
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?isSeed=true",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    for page in data["items"]:
+        assert page["isSeed"]
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?isSeed=false",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    for page in data["items"]:
+        assert page["isSeed"] is False
+
+    # Test depth filter
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?depth=0",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    for page in data["items"]:
+        assert page["depth"] == 0
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?depth=1",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    for page in data["items"]:
+        assert page["depth"] == 1
+
+
 def test_remove_upload_from_collection(crawler_auth_headers, default_org_id):
     # Remove upload
     r = requests.post(
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index 511c4c6c..51cec3cb 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -658,7 +658,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     )
     assert r.status_code == 200
     data = r.json()
-    assert data["total"] >= 0
+
+    assert data["total"] == 3
 
     pages = data["items"]
     assert pages
@@ -682,7 +683,11 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
 
     # Test GET page endpoint
     global page_id
-    page_id = pages[0]["id"]
+    test_page = pages[0]
+    page_id = test_page["id"]
+    test_page_url = test_page["url"]
+    test_page_ts = test_page["ts"]
+
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
         headers=crawler_auth_headers,
@@ -710,13 +715,100 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page.get("modified") is None
     assert page.get("approved") is None
 
+    # Test exact url filter
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?url={test_page_url}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert matching_page["url"] == test_page_url
+
+    # Test exact url and ts filters together
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?url={test_page_url}&ts={test_page_ts}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert matching_page["url"] == test_page_url
+        assert matching_page["ts"] == test_page_ts
+
+    # Test urlPrefix filter
+    url_prefix = test_page_url[:8]
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?urlPrefix={url_prefix}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+
+    found_matching_page = False
+    for page in data["items"]:
+        if page["id"] == page_id and page["url"] == test_page_url:
+            found_matching_page = True
+
+    assert found_matching_page
+
+    # Test isSeed filter
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?isSeed=True",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] == 1
+    for page in data["items"]:
+        assert page["isSeed"]
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?isSeed=False",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] == 2
+    for page in data["items"]:
+        assert page["isSeed"] is False
+
+    # Test depth filter
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?depth=0",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] == 1
+    for page in data["items"]:
+        assert page["depth"] == 0
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?depth=1",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] == 2
+    for page in data["items"]:
+        assert page["depth"] == 1
+
+
+def test_crawl_pages_qa_filters(crawler_auth_headers, default_org_id, crawler_crawl_id):
     # Test reviewed filter (page has no notes or approved so should show up in false)
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=False",
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
-    assert r.json()["total"] == 1
+    assert r.json()["total"] == 3
 
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True",
@@ -770,15 +862,15 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
-    assert r.json()["total"] == 0
+    assert r.json()["total"] == 2
 
-    # Test reviewed filter (page now approved so should show up in True)
+    # Test reviewed filter (page now approved so should show up in True, other pages show here)
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=False",
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
-    assert r.json()["total"] == 0
+    assert r.json()["total"] == 2
 
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True",
@@ -853,7 +945,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
-    assert r.json()["total"] == 0
+    assert r.json()["total"] == 2
 
 
 def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
@@ -985,14 +1077,14 @@ def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
-    assert r.json()["total"] == 1
+    assert r.json()["total"] == 3
 
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=true,false,none",
         headers=crawler_auth_headers,
    )
     assert r.status_code == 200
-    assert r.json()["total"] == 1
+    assert r.json()["total"] == 3
 
     # Test reviewed filter (page now has notes so should show up in True)
     r = requests.get(
@@ -1000,7 +1092,7 @@ def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
-    assert r.json()["total"] == 0
+    assert r.json()["total"] == 2
 
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True",
@@ -1015,7 +1107,7 @@ def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
-    assert r.json()["total"] == 0
+    assert r.json()["total"] == 2
 
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?hasNotes=True",
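
A minimal usage sketch, not part of the patch: it shows how the new GET /orgs/{oid}/collections/{coll_id}/pages endpoint and its filter/sort query parameters might be called from a client, mirroring the requests-based style of the tests above. The API_PREFIX value, bearer token, and org/collection IDs below are assumed placeholders, not values taken from this diff.

import requests

# Assumed placeholders; substitute real values for your deployment.
API_PREFIX = "http://localhost:30870/api"
auth_headers = {"Authorization": "Bearer <access-token>"}
org_id = "<default_org_id>"
coll_id = "<collection_id>"

# Query pages in a collection, filtered to seed pages at depth 0 whose URL
# starts with the given prefix, newest first.
r = requests.get(
    f"{API_PREFIX}/orgs/{org_id}/collections/{coll_id}/pages",
    headers=auth_headers,
    params={
        "urlPrefix": "https://webrecorder.net/",  # case-insensitive prefix match
        "isSeed": True,
        "depth": 0,
        "sortBy": "ts",
        "sortDirection": -1,
        "pageSize": 25,
        "page": 1,
    },
)
data = r.json()
print(data["total"], [page["url"] for page in data["items"]])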