diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index 56e28409..b5bb52a3 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -3,6 +3,7 @@
 from datetime import datetime, timedelta
 from typing import Optional, List, Union, Dict, Any, Type, TYPE_CHECKING, cast, Tuple
 from uuid import UUID
+import os
 import urllib.parse
 import asyncio
 
@@ -31,7 +32,7 @@ from .models import (
     PRESIGN_DURATION_SECONDS,
 )
 from .pagination import paginated_format, DEFAULT_PAGE_SIZE
-from .utils import dt_now, date_to_str
+from .utils import dt_now, date_to_str, get_origin
 
 if TYPE_CHECKING:
     from .crawlconfigs import CrawlConfigOps
@@ -156,6 +157,7 @@ class BaseCrawlOps:
         org: Optional[Organization] = None,
         type_: Optional[str] = None,
         skip_resources=False,
+        headers: Optional[dict] = None,
     ) -> CrawlOutWithResources:
         """Get crawl data for api output"""
         res = await self.get_crawl_raw(crawlid, org, type_)
@@ -168,6 +170,16 @@ class BaseCrawlOps:
         if coll_ids:
             res["collections"] = await self.colls.get_collection_names(coll_ids)
 
+        res["initialPages"], _ = await self.page_ops.list_pages(
+            crawlid, is_seed=True, page_size=25
+        )
+
+        oid = res.get("oid")
+        if oid:
+            res["pagesQueryUrl"] = (
+                get_origin(headers) + f"/api/orgs/{oid}/crawls/{crawlid}/pages"
+            )
+
         crawl = CrawlOutWithResources.from_dict(res)
 
         if not skip_resources:
@@ -497,7 +509,7 @@ class BaseCrawlOps:
             out_files.append(
                 CrawlFileOut(
-                    name=file_.filename,
+                    name=os.path.basename(file_.filename),
                     path=presigned_url or "",
                     hash=file_.hash,
                     size=file_.size,
diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py
index 0ea137a0..2103c61e 100644
--- a/backend/btrixcloud/colls.py
+++ b/backend/btrixcloud/colls.py
@@ -53,16 +53,18 @@ from .models import (
     ImageFilePreparer,
     MIN_UPLOAD_PART_SIZE,
     PublicCollOut,
+    PreloadResource,
 )
-from .utils import dt_now, slug_from_name, get_duplicate_key_error_field
+from .utils import dt_now, slug_from_name, get_duplicate_key_error_field, get_origin
 
 if TYPE_CHECKING:
     from .orgs import OrgOps
     from .storages import StorageOps
     from .webhooks import EventWebhookOps
     from .crawls import CrawlOps
+    from .pages import PageOps
 else:
-    OrgOps = StorageOps = EventWebhookOps = CrawlOps = object
+    OrgOps = StorageOps = EventWebhookOps = CrawlOps = PageOps = object
 
 
 THUMBNAIL_MAX_SIZE = 2_000_000
@@ -78,6 +80,7 @@ class CollectionOps:
     storage_ops: StorageOps
     event_webhook_ops: EventWebhookOps
     crawl_ops: CrawlOps
+    page_ops: PageOps
 
     def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):
         self.collections = mdb["collections"]
@@ -337,12 +340,28 @@ class CollectionOps:
         org: Organization,
         resources=False,
         public_or_unlisted_only=False,
+        headers: Optional[dict] = None,
     ) -> CollOut:
         """Get CollOut by id"""
+        # pylint: disable=too-many-locals
         result = await self.get_collection_raw(coll_id, public_or_unlisted_only)
 
         if resources:
-            result["resources"] = await self.get_collection_crawl_resources(coll_id)
+            result["resources"], result["preloadResources"] = (
+                await self.get_collection_crawl_resources(
+                    coll_id, include_preloads=True
+                )
+            )
+
+            result["initialPages"], result["totalPages"] = (
+                await self.page_ops.list_collection_pages(coll_id, page_size=25)
+            )
+
+            public = "public/" if public_or_unlisted_only else ""
+            result["pagesQueryUrl"] = (
+                get_origin(headers)
+                + f"/api/orgs/{org.id}/collections/{coll_id}/{public}pages"
+            )
 
         thumbnail = result.get("thumbnail")
         if thumbnail:
@@ -369,7 +388,7 @@ class CollectionOps:
         if result.get("access") not in allowed_access:
             raise HTTPException(status_code=404, detail="collection_not_found")
 
-        result["resources"] = await self.get_collection_crawl_resources(coll_id)
+        result["resources"], _ = await self.get_collection_crawl_resources(coll_id)
 
         thumbnail = result.get("thumbnail")
         if thumbnail:
@@ -468,7 +487,11 @@ class CollectionOps:
         collections: List[Union[CollOut, PublicCollOut]] = []
 
         for res in items:
-            res["resources"] = await self.get_collection_crawl_resources(res["_id"])
+            res["resources"], res["preloadResources"] = (
+                await self.get_collection_crawl_resources(
+                    res["_id"], include_preloads=not public_colls_out
+                )
+            )
 
             thumbnail = res.get("thumbnail")
             if thumbnail:
@@ -490,12 +513,14 @@ class CollectionOps:
 
         return collections, total
 
-    async def get_collection_crawl_resources(self, coll_id: UUID):
+    async def get_collection_crawl_resources(
+        self, coll_id: UUID, include_preloads=False
+    ):
         """Return pre-signed resources for all collection crawl files."""
         # Ensure collection exists
         _ = await self.get_collection_raw(coll_id)
 
-        all_files = []
+        resources = []
 
         crawls, _ = await self.crawl_ops.list_all_base_crawls(
             collection_id=coll_id,
@@ -506,9 +531,36 @@ class CollectionOps:
 
         for crawl in crawls:
             if crawl.resources:
-                all_files.extend(crawl.resources)
+                resources.extend(crawl.resources)
 
-        return all_files
+        preload_resources: List[PreloadResource] = []
+
+        if include_preloads:
+            no_page_items = await self.get_collection_resources_with_no_pages(crawls)
+            for item in no_page_items:
+                preload_resources.append(item)
+
+        return resources, preload_resources
+
+    async def get_collection_resources_with_no_pages(
+        self, crawls: List[CrawlOutWithResources]
+    ) -> List[PreloadResource]:
+        """Return wacz files in collection that have no pages"""
+        resources_no_pages: List[PreloadResource] = []
+
+        for crawl in crawls:
+            _, page_count = await self.page_ops.list_pages(crawl.id)
+            if page_count == 0 and crawl.resources:
+                for resource in crawl.resources:
+                    resources_no_pages.append(
+                        PreloadResource(
+                            name=os.path.basename(resource.name),
+                            crawlId=crawl.id,
+                            hasPages=False,
+                        )
+                    )
+
+        return resources_no_pages
 
     async def get_collection_names(self, uuids: List[UUID]):
         """return object of {_id, names} given list of collection ids"""
@@ -528,9 +580,15 @@ class CollectionOps:
         names = [name for name in names if name]
         return {"names": names}
 
-    async def get_collection_crawl_ids(self, coll_id: UUID) -> List[str]:
-        """Return list of crawl ids in collection"""
+    async def get_collection_crawl_ids(
+        self, coll_id: UUID, public_or_unlisted_only=False
+    ) -> List[str]:
+        """Return list of crawl ids in collection, including only public collections"""
         crawl_ids = []
+        # ensure collection is public or unlisted, else throw here
+        if public_or_unlisted_only:
+            await self.get_collection_raw(coll_id, public_or_unlisted_only)
+
         async for crawl_raw in self.crawls.find(
             {"collectionIds": coll_id}, projection=["_id"]
         ):
@@ -1010,8 +1068,8 @@ def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_de
         try:
             all_collections, _ = await colls.list_collections(org, page_size=10_000)
             for collection in all_collections:
-                results[collection.name] = await colls.get_collection_crawl_resources(
-                    collection.id
+                results[collection.name], _ = (
+                    await colls.get_collection_crawl_resources(collection.id)
                 )
         except Exception as exc:
             # pylint: disable=raise-missing-from
@@ -1047,9 +1105,11 @@ def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_de
         response_model=CollOut,
     )
     async def get_collection_replay(
-        coll_id: UUID, org: Organization = Depends(org_viewer_dep)
+        request: Request, coll_id: UUID, org: Organization = Depends(org_viewer_dep)
     ):
-        return await colls.get_collection_out(coll_id, org, resources=True)
+        return await colls.get_collection_out(
+            coll_id, org, resources=True, headers=dict(request.headers)
+        )
 
     @app.get(
         "/orgs/{oid}/collections/{coll_id}/public/replay.json",
@@ -1057,12 +1117,17 @@ def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_de
         response_model=CollOut,
     )
     async def get_collection_public_replay(
+        request: Request,
         response: Response,
         coll_id: UUID,
         org: Organization = Depends(org_public),
     ):
         coll = await colls.get_collection_out(
-            coll_id, org, resources=True, public_or_unlisted_only=True
+            coll_id,
+            org,
+            resources=True,
+            public_or_unlisted_only=True,
+            headers=dict(request.headers),
         )
         response.headers["Access-Control-Allow-Origin"] = "*"
         response.headers["Access-Control-Allow-Headers"] = "*"
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index 33f13415..41449b73 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -881,14 +881,6 @@ class CrawlOut(BaseMongoModel):
     errorPageCount: Optional[int] = 0
 
 
-# ============================================================================
-class CrawlOutWithResources(CrawlOut):
-    """Crawl output model including resources"""
-
-    resources: Optional[List[CrawlFileOut]] = []
-    collections: Optional[List[CollIdName]] = []
-
-
 # ============================================================================
 class UpdateCrawl(BaseModel):
     """Update crawl"""
@@ -1222,6 +1214,173 @@ class ImageFilePreparer(FilePreparer):
         )
 
 
+# ============================================================================
+
+### PAGES ###
+
+
+# ============================================================================
+class PageReviewUpdate(BaseModel):
+    """Update model for page manual review/approval"""
+
+    approved: Optional[bool] = None
+
+
+# ============================================================================
+class PageNoteIn(BaseModel):
+    """Input model for adding page notes"""
+
+    text: str
+
+
+# ============================================================================
+class PageNoteEdit(BaseModel):
+    """Input model for editing page notes"""
+
+    id: UUID
+    text: str
+
+
+# ============================================================================
+class PageNoteDelete(BaseModel):
+    """Delete model for page notes"""
+
+    delete_list: List[UUID] = []
+
+
+# ============================================================================
+class PageNote(BaseModel):
+    """Model for page notes, tracking user and time"""
+
+    id: UUID
+    text: str
+    created: datetime
+    userid: UUID
+    userName: str
+
+
+# ============================================================================
+class PageQACompare(BaseModel):
+    """Model for updating pages from QA run"""
+
+    screenshotMatch: Optional[float] = None
+    textMatch: Optional[float] = None
+    resourceCounts: Optional[Dict[str, int]] = None
+
+
+# ============================================================================
+class Page(BaseMongoModel):
+    """Core page data, no QA"""
+
+    id: UUID
+
+    oid: UUID
+    crawl_id: str
+
+    # core page data
+    url: AnyHttpUrl
+    title: Optional[str] = None
+    ts: Optional[datetime] = None
+    loadState: Optional[int] = None
+    status: Optional[int] = None
+    mime: Optional[str] = None
+    filename: Optional[str] = None
+    depth: Optional[int] = None
+    favIconUrl: Optional[AnyHttpUrl] = None
+    isSeed: Optional[bool] = False
+
+    # manual review
+    userid: Optional[UUID] = None
+    modified: Optional[datetime] = None
+    approved: Optional[bool] = None
+    notes: List[PageNote] = []
+
+    isFile: Optional[bool] = False
+    isError: Optional[bool] = False
+
+    def compute_page_type(self):
+        """sets self.isFile or self.isError flags"""
+        self.isFile = False
+        self.isError = False
+        if self.loadState == 2:
+            # pylint: disable=unsupported-membership-test
+            if self.mime and "html" not in self.mime:
+                self.isFile = True
+            elif self.title is None and self.status == 200:
+                self.isFile = True
+
+        elif self.loadState == 0:
+            self.isError = True
+
+
+# ============================================================================
+class PageWithAllQA(Page):
+    """Model for core page data + qa"""
+
+    # automated heuristics, keyed by QA run id
+    qa: Optional[Dict[str, PageQACompare]] = {}
+
+
+# ============================================================================
+class PageOut(Page):
+    """Model for pages output, no QA"""
+
+    status: int = 200
+
+
+# ============================================================================
+class PageOutWithSingleQA(Page):
+    """Page out with single QA entry"""
+
+    qa: Optional[PageQACompare] = None
+
+
+# ============================================================================
+class PageNoteAddedResponse(BaseModel):
+    """Model for response to adding page"""
+
+    added: bool
+    data: PageNote
+
+
+# ============================================================================
+class PageNoteUpdatedResponse(BaseModel):
+    """Model for response to updating page"""
+
+    updated: bool
+    data: PageNote
+
+
+# ============================================================================
+class PageIdTimestamp(BaseModel):
+    """Simplified model for page info to include in PageUrlCount"""
+
+    pageId: UUID
+    ts: Optional[datetime] = None
+    status: int = 200
+
+
+# ============================================================================
+class PageUrlCount(BaseModel):
+    """Model for counting pages by URL"""
+
+    url: AnyHttpUrl
+    count: int = 0
+    snapshots: List[PageIdTimestamp] = []
+
+
+# ============================================================================
+class CrawlOutWithResources(CrawlOut):
+    """Crawl output model including resources"""
+
+    resources: Optional[List[CrawlFileOut]] = []
+    collections: Optional[List[CollIdName]] = []
+
+    initialPages: List[PageOut] = []
+    totalPages: Optional[int] = None
+    pagesQueryUrl: str = ""
+
+
 # ============================================================================
 
 ### COLLECTIONS ###
@@ -1245,6 +1404,15 @@ class CollectionThumbnailSource(BaseModel):
     urlPageId: UUID
 
 
+# ============================================================================
+class PreloadResource(BaseModel):
+    """Resources that will preloaded in RWP"""
+
+    name: str
+    crawlId: str
+    hasPages: bool
+
+
 # ============================================================================
 class Collection(BaseMongoModel):
     """Org collection structure"""
@@ -1338,6 +1506,11 @@ class CollOut(BaseMongoModel):
 
     allowPublicDownload: bool = True
 
+    initialPages: List[PageOut] = []
+    totalPages: Optional[int] = None
+    preloadResources: List[PreloadResource] = []
+    pagesQueryUrl: str = ""
+
 
 # ============================================================================
 class PublicCollOut(BaseMongoModel):
@@ -2435,161 +2608,6 @@ AnyJob = RootModel[
 ]
 
 
-# ============================================================================
-
-### PAGES ###
-
-
-# ============================================================================
-class PageReviewUpdate(BaseModel):
-    """Update model for page manual review/approval"""
-
-    approved: Optional[bool] = None
-
-
-# ============================================================================
-class PageNoteIn(BaseModel):
-    """Input model for adding page notes"""
-
-    text: str
-
-
-# ============================================================================
-class PageNoteEdit(BaseModel):
-    """Input model for editing page notes"""
-
-    id: UUID
-    text: str
-
-
-# ============================================================================
-class PageNoteDelete(BaseModel):
-    """Delete model for page notes"""
-
-    delete_list: List[UUID] = []
-
-
-# ============================================================================
-class PageNote(BaseModel):
-    """Model for page notes, tracking user and time"""
-
-    id: UUID
-    text: str
-    created: datetime
-    userid: UUID
-    userName: str
-
-
-# ============================================================================
-class PageQACompare(BaseModel):
-    """Model for updating pages from QA run"""
-
-    screenshotMatch: Optional[float] = None
-    textMatch: Optional[float] = None
-    resourceCounts: Optional[Dict[str, int]] = None
-
-
-# ============================================================================
-class Page(BaseMongoModel):
-    """Core page data, no QA"""
-
-    id: UUID
-
-    oid: UUID
-    crawl_id: str
-
-    # core page data
-    url: AnyHttpUrl
-    title: Optional[str] = None
-    ts: Optional[datetime] = None
-    loadState: Optional[int] = None
-    status: Optional[int] = None
-    mime: Optional[str] = None
-    filename: Optional[str] = None
-    depth: Optional[int] = None
-    favIconUrl: Optional[AnyHttpUrl] = None
-    isSeed: Optional[bool] = False
-
-    # manual review
-    userid: Optional[UUID] = None
-    modified: Optional[datetime] = None
-    approved: Optional[bool] = None
-    notes: List[PageNote] = []
-
-    isFile: Optional[bool] = False
-    isError: Optional[bool] = False
-
-    def compute_page_type(self):
-        """sets self.isFile or self.isError flags"""
-        self.isFile = False
-        self.isError = False
-        if self.loadState == 2:
-            # pylint: disable=unsupported-membership-test
-            if self.mime and "html" not in self.mime:
-                self.isFile = True
-            elif self.title is None and self.status == 200:
-                self.isFile = True
-
-        elif self.loadState == 0:
-            self.isError = True
-
-
-# ============================================================================
-class PageWithAllQA(Page):
-    """Model for core page data + qa"""
-
-    # automated heuristics, keyed by QA run id
-    qa: Optional[Dict[str, PageQACompare]] = {}
-
-
-# ============================================================================
-class PageOut(Page):
-    """Model for pages output, no QA"""
-
-    status: int = 200
-
-
-# ============================================================================
-class PageOutWithSingleQA(Page):
-    """Page out with single QA entry"""
-
-    qa: Optional[PageQACompare] = None
-
-
-# ============================================================================
-class PageNoteAddedResponse(BaseModel):
-    """Model for response to adding page"""
-
-    added: bool
-    data: PageNote
-
-
-# ============================================================================
-class PageNoteUpdatedResponse(BaseModel):
-    """Model for response to updating page"""
-
-    updated: bool
-    data: PageNote
-
-
-# ============================================================================
-class PageIdTimestamp(BaseModel):
-    """Simplified model for page info to include in PageUrlCount"""
-
-    pageId: UUID
-    ts: Optional[datetime] = None
-    status: int = 200
-
-
-# ============================================================================
-class PageUrlCount(BaseModel):
-    """Model for counting pages by URL"""
-
-    url: AnyHttpUrl
-    count: int = 0
-    snapshots: List[PageIdTimestamp] = []
-
-
 # ============================================================================
 
 ### GENERIC RESPONSE MODELS ###
diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index ebabd297..d9e50174 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -11,7 +11,7 @@ from datetime import datetime
 from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
 from uuid import UUID, uuid4
 
-from fastapi import Depends, HTTPException, Request
+from fastapi import Depends, HTTPException, Request, Response
 import pymongo
 
 from .models import (
@@ -35,6 +35,7 @@ from .models import (
     DeletedResponse,
     PageNoteAddedResponse,
     PageNoteUpdatedResponse,
+    EmptyResponse,
 )
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
 from .utils import str_to_date, str_list_to_bools, dt_now
@@ -503,6 +504,7 @@ class PageOps:
         self,
         crawl_id: str,
         org: Optional[Organization] = None,
+        search: Optional[str] = None,
         url: Optional[str] = None,
         url_prefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -534,6 +536,13 @@ class PageOps:
         if org:
             query["oid"] = org.id
 
+        if search:
+            search_regex = re.escape(urllib.parse.unquote(search))
+            query["$or"] = [
+                {"url": {"$regex": search_regex, "$options": "i"}},
+                {"title": {"$regex": search_regex, "$options": "i"}},
+            ]
+
         if url_prefix:
             url_prefix = urllib.parse.unquote(url_prefix)
             regex_pattern = f"^{re.escape(url_prefix)}"
@@ -661,6 +670,7 @@ class PageOps:
         self,
         coll_id: UUID,
         org: Optional[Organization] = None,
+        search: Optional[str] = None,
         url: Optional[str] = None,
         url_prefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -670,6 +680,7 @@ class PageOps:
         page: int = 1,
         sort_by: Optional[str] = None,
         sort_direction: Optional[int] = -1,
+        public_or_unlisted_only=False,
     ) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]:
         """List all pages in collection, with optional filtering"""
         # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
@@ -677,7 +688,9 @@ class PageOps:
         page = page - 1
         skip = page_size * page
 
-        crawl_ids = await self.coll_ops.get_collection_crawl_ids(coll_id)
+        crawl_ids = await self.coll_ops.get_collection_crawl_ids(
+            coll_id, public_or_unlisted_only
+        )
 
         query: dict[str, object] = {
             "crawl_id": {"$in": crawl_ids},
@@ -685,7 +698,14 @@ class PageOps:
         if org:
             query["oid"] = org.id
 
-        if url_prefix:
+        if search:
+            search_regex = re.escape(urllib.parse.unquote(search))
+            query["$or"] = [
+                {"url": {"$regex": search_regex, "$options": "i"}},
+                {"title": {"$regex": search_regex, "$options": "i"}},
+            ]
+
+        elif url_prefix:
             url_prefix = urllib.parse.unquote(url_prefix)
             regex_pattern = f"^{re.escape(url_prefix)}"
             query["url"] = {"$regex": regex_pattern, "$options": "i"}
@@ -724,6 +744,9 @@ class PageOps:
                 raise HTTPException(status_code=400, detail="invalid_sort_direction")
 
             aggregate.extend([{"$sort": {sort_by: sort_direction}}])
+        else:
+            # default sort: seeds first, then by timestamp
+            aggregate.extend([{"$sort": {"isSeed": -1, "ts": 1}}])
 
         aggregate.extend(
             [
@@ -886,6 +909,7 @@ def init_pages_api(
 
     org_viewer_dep = org_ops.org_viewer_dep
     org_crawl_dep = org_ops.org_crawl_dep
+    org_public = org_ops.org_public
 
     @app.post(
         "/orgs/{oid}/crawls/all/pages/reAdd",
@@ -1056,6 +1080,7 @@ def init_pages_api(
     async def get_crawl_pages_list(
         crawl_id: str,
         org: Organization = Depends(org_crawl_dep),
+        search: Optional[str] = None,
         url: Optional[str] = None,
         urlPrefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -1077,6 +1102,7 @@ def init_pages_api(
         pages, total = await ops.list_pages(
             crawl_id=crawl_id,
             org=org,
+            search=search,
             url=url,
             url_prefix=urlPrefix,
             ts=ts,
@@ -1093,13 +1119,15 @@ def init_pages_api(
         return paginated_format(pages, total, page, pageSize)
 
     @app.get(
-        "/orgs/{oid}/collections/{coll_id}/pages",
+        "/orgs/{oid}/collections/{coll_id}/public/pages",
         tags=["pages", "collections"],
         response_model=PaginatedPageOutResponse,
     )
-    async def get_collection_pages_list(
+    async def get_public_collection_pages_list(
         coll_id: UUID,
-        org: Organization = Depends(org_viewer_dep),
+        response: Response,
+        org: Organization = Depends(org_public),
+        search: Optional[str] = None,
         url: Optional[str] = None,
         urlPrefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -1114,6 +1142,58 @@ def init_pages_api(
         pages, total = await ops.list_collection_pages(
             coll_id=coll_id,
             org=org,
+            search=search,
+            url=url,
+            url_prefix=urlPrefix,
+            ts=ts,
+            is_seed=isSeed,
+            depth=depth,
+            page_size=pageSize,
+            page=page,
+            sort_by=sortBy,
+            sort_direction=sortDirection,
+            public_or_unlisted_only=True,
+        )
+
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return paginated_format(pages, total, page, pageSize)
+
+    @app.options(
+        "/orgs/{oid}/collections/{coll_id}/public/pages",
+        tags=["pages", "collections"],
+        response_model=EmptyResponse,
+    )
+    async def get_replay_preflight(response: Response):
+        response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return {}
+
+    @app.get(
+        "/orgs/{oid}/collections/{coll_id}/pages",
+        tags=["pages", "collections"],
+        response_model=PaginatedPageOutResponse,
+    )
+    async def get_collection_pages_list(
+        coll_id: UUID,
+        org: Organization = Depends(org_viewer_dep),
+        search: Optional[str] = None,
+        url: Optional[str] = None,
+        urlPrefix: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        isSeed: Optional[bool] = None,
+        depth: Optional[int] = None,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+        sortBy: Optional[str] = None,
+        sortDirection: Optional[int] = -1,
+    ):
+        """Retrieve paginated list of pages in collection"""
+        pages, total = await ops.list_collection_pages(
+            coll_id=coll_id,
+            org=org,
+            search=search,
             url=url,
             url_prefix=urlPrefix,
             ts=ts,
diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py
index bdeaae23..3bace9eb 100644
--- a/backend/test/test_collections.py
+++ b/backend/test/test_collections.py
@@ -382,6 +382,11 @@ def test_get_collection_replay(crawler_auth_headers, default_org_id):
     assert data["dateEarliest"]
     assert data["dateLatest"]
     assert data["defaultThumbnailName"]
+    assert data["initialPages"]
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/collections/{_coll_id}/pages"
+    )
+    assert "preloadResources" in data
 
     resources = data["resources"]
     assert resources
@@ -413,10 +418,27 @@ def test_collection_public(crawler_auth_headers, default_org_id):
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
         headers=crawler_auth_headers,
     )
+    data = r.json()
+    assert data["initialPages"]
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/collections/{_coll_id}/public/pages"
+    )
+    assert "preloadResources" in data
+
     assert r.status_code == 200
     assert r.headers["Access-Control-Allow-Origin"] == "*"
     assert r.headers["Access-Control-Allow-Headers"] == "*"
 
+    # test public pages endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] > 0
+    assert data["items"]
+
     # make unlisted and test replay headers
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
@@ -451,6 +473,12 @@ def test_collection_public(crawler_auth_headers, default_org_id):
     )
     assert r.status_code == 404
 
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 404
+
 
 def test_collection_access_invalid_value(crawler_auth_headers, default_org_id):
     r = requests.patch(
@@ -614,6 +642,39 @@ def test_list_pages_in_collection(crawler_auth_headers, default_org_id):
     coll_page_id = coll_page["id"]
     coll_page_url = coll_page["url"]
     coll_page_ts = coll_page["ts"]
+    coll_page_title = coll_page["title"]
+
+    # Test search filter
+    partial_title = coll_page_title[:5]
+    partial_url = coll_page_url[:8]
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_title}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert (
+            partial_title in matching_page["title"]
+            or partial_url in matching_page["url"]
+        )
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_url}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert (
+            partial_title in matching_page["title"]
+            or partial_url in matching_page["url"]
+        )
 
     # Test exact url filter
     r = requests.get(
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index 51cec3cb..5e454d43 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -184,6 +184,11 @@ def test_wait_for_complete(admin_auth_headers, default_org_id):
     assert len(data["resources"]) == 1
     assert data["resources"][0]["path"]
 
+    assert len(data["initialPages"]) == 1
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/crawls/{admin_crawl_id}/pages"
+    )
+
     # ensure filename matches specified pattern
     # set in default_crawl_filename_template
     assert re.search("/[\\d]+-testing-[\\w-]+\\.wacz", data["resources"][0]["path"])