Add initial pages + pagesQuery endpoint to /replay.json APIs (#2380)

Fixes #2360 

- Adds `initialPages` to the /replay.json response for collections, returning
up to 25 pages (seed pages first, then sorted by capture time).
- Adds `pagesQueryUrl` to /replay.json.
- Adds a public pages search endpoint to support public collections.
- Adds `preloadResources` to /replay.json, listing WACZ files that should
always be loaded (see the usage sketch below).

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer authored 2025-02-13 16:53:47 -08:00, committed by GitHub
parent 73f9f949af, commit 7b2932c582
6 changed files with 428 additions and 187 deletions
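
A minimal sketch of how a client might consume the new fields from a public collection's /replay.json (the host and IDs below are placeholders, not values from this PR):

```python
import requests

API = "https://app.browsertrix.example"  # hypothetical deployment URL
org_id = "ORG-UUID"                      # placeholder org id
coll_id = "COLL-UUID"                    # placeholder collection id

# Public collections require no auth token
r = requests.get(f"{API}/api/orgs/{org_id}/collections/{coll_id}/public/replay.json")
r.raise_for_status()
data = r.json()

# Up to 25 pages, seed pages first, then sorted by capture time
for page in data["initialPages"]:
    print(page["url"], page.get("ts"))

# Endpoint to query/search the full page list
print(data["pagesQueryUrl"])

# WACZ files the replay client should always load
for res in data.get("preloadResources", []):
    print(res["name"], res["crawlId"])
```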

View File

@ -3,6 +3,7 @@
from datetime import datetime, timedelta
from typing import Optional, List, Union, Dict, Any, Type, TYPE_CHECKING, cast, Tuple
from uuid import UUID
import os
import urllib.parse
import asyncio
@ -31,7 +32,7 @@ from .models import (
PRESIGN_DURATION_SECONDS,
)
from .pagination import paginated_format, DEFAULT_PAGE_SIZE
from .utils import dt_now, date_to_str
from .utils import dt_now, date_to_str, get_origin
if TYPE_CHECKING:
from .crawlconfigs import CrawlConfigOps
@ -156,6 +157,7 @@ class BaseCrawlOps:
org: Optional[Organization] = None,
type_: Optional[str] = None,
skip_resources=False,
headers: Optional[dict] = None,
) -> CrawlOutWithResources:
"""Get crawl data for api output"""
res = await self.get_crawl_raw(crawlid, org, type_)
@ -168,6 +170,16 @@ class BaseCrawlOps:
if coll_ids:
res["collections"] = await self.colls.get_collection_names(coll_ids)
res["initialPages"], _ = await self.page_ops.list_pages(
crawlid, is_seed=True, page_size=25
)
oid = res.get("oid")
if oid:
res["pagesQueryUrl"] = (
get_origin(headers) + f"/api/orgs/{oid}/crawls/{crawlid}/pages"
)
crawl = CrawlOutWithResources.from_dict(res)
if not skip_resources:
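
get_origin (imported from .utils) is used here to turn the incoming request headers into an absolute origin for pagesQueryUrl. The helper itself is not shown in this diff; a rough, assumed sketch of the idea, not the actual implementation:

```python
from typing import Optional

def get_origin_sketch(headers: Optional[dict]) -> str:
    # Assumed behavior only -- the real get_origin lives in .utils and may also
    # consult deployment config; this just shows the idea of rebuilding the
    # external origin from proxy-forwarded headers (dict(request.headers) uses
    # lowercase keys).
    headers = headers or {}
    scheme = headers.get("x-forwarded-proto", "https")
    host = headers.get("host", "localhost")
    return f"{scheme}://{host}"
```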
@ -497,7 +509,7 @@ class BaseCrawlOps:
out_files.append(
CrawlFileOut(
name=file_.filename,
name=os.path.basename(file_.filename),
path=presigned_url or "",
hash=file_.hash,
size=file_.size,
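
The resource name returned to clients is now reduced to the file's basename, so storage-prefixed keys are not exposed in replay.json. A small illustration (the stored key below is made up):

```python
import os

# Hypothetical stored object key -> only the WACZ filename is returned
stored_name = "prefix/org-uuid/crawl-id/manual-20250213-abc123.wacz"
print(os.path.basename(stored_name))  # manual-20250213-abc123.wacz
```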

View File

@ -53,16 +53,18 @@ from .models import (
ImageFilePreparer,
MIN_UPLOAD_PART_SIZE,
PublicCollOut,
PreloadResource,
)
from .utils import dt_now, slug_from_name, get_duplicate_key_error_field
from .utils import dt_now, slug_from_name, get_duplicate_key_error_field, get_origin
if TYPE_CHECKING:
from .orgs import OrgOps
from .storages import StorageOps
from .webhooks import EventWebhookOps
from .crawls import CrawlOps
from .pages import PageOps
else:
OrgOps = StorageOps = EventWebhookOps = CrawlOps = object
OrgOps = StorageOps = EventWebhookOps = CrawlOps = PageOps = object
THUMBNAIL_MAX_SIZE = 2_000_000
@ -78,6 +80,7 @@ class CollectionOps:
storage_ops: StorageOps
event_webhook_ops: EventWebhookOps
crawl_ops: CrawlOps
page_ops: PageOps
def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):
self.collections = mdb["collections"]
@ -337,12 +340,28 @@ class CollectionOps:
org: Organization,
resources=False,
public_or_unlisted_only=False,
headers: Optional[dict] = None,
) -> CollOut:
"""Get CollOut by id"""
# pylint: disable=too-many-locals
result = await self.get_collection_raw(coll_id, public_or_unlisted_only)
if resources:
result["resources"] = await self.get_collection_crawl_resources(coll_id)
result["resources"], result["preloadResources"] = (
await self.get_collection_crawl_resources(
coll_id, include_preloads=True
)
)
result["initialPages"], result["totalPages"] = (
await self.page_ops.list_collection_pages(coll_id, page_size=25)
)
public = "public/" if public_or_unlisted_only else ""
result["pagesQueryUrl"] = (
get_origin(headers)
+ f"/api/orgs/{org.id}/collections/{coll_id}/{public}pages"
)
thumbnail = result.get("thumbnail")
if thumbnail:
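
Depending on which endpoint served the collection, pagesQueryUrl points at either the authenticated or the public pages listing; roughly (origin and IDs are placeholders):

```python
origin, oid, coll_id = "https://app.browsertrix.example", "ORG-UUID", "COLL-UUID"

# replay.json (authenticated)
authed_pages_url = f"{origin}/api/orgs/{oid}/collections/{coll_id}/pages"

# public/replay.json (public or unlisted collections)
public_pages_url = f"{origin}/api/orgs/{oid}/collections/{coll_id}/public/pages"
```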
@ -369,7 +388,7 @@ class CollectionOps:
if result.get("access") not in allowed_access:
raise HTTPException(status_code=404, detail="collection_not_found")
result["resources"] = await self.get_collection_crawl_resources(coll_id)
result["resources"], _ = await self.get_collection_crawl_resources(coll_id)
thumbnail = result.get("thumbnail")
if thumbnail:
@ -468,7 +487,11 @@ class CollectionOps:
collections: List[Union[CollOut, PublicCollOut]] = []
for res in items:
res["resources"] = await self.get_collection_crawl_resources(res["_id"])
res["resources"], res["preloadResources"] = (
await self.get_collection_crawl_resources(
res["_id"], include_preloads=not public_colls_out
)
)
thumbnail = res.get("thumbnail")
if thumbnail:
@ -490,12 +513,14 @@ class CollectionOps:
return collections, total
async def get_collection_crawl_resources(self, coll_id: UUID):
async def get_collection_crawl_resources(
self, coll_id: UUID, include_preloads=False
):
"""Return pre-signed resources for all collection crawl files."""
# Ensure collection exists
_ = await self.get_collection_raw(coll_id)
all_files = []
resources = []
crawls, _ = await self.crawl_ops.list_all_base_crawls(
collection_id=coll_id,
@ -506,9 +531,36 @@ class CollectionOps:
for crawl in crawls:
if crawl.resources:
all_files.extend(crawl.resources)
resources.extend(crawl.resources)
return all_files
preload_resources: List[PreloadResource] = []
if include_preloads:
no_page_items = await self.get_collection_resources_with_no_pages(crawls)
for item in no_page_items:
preload_resources.append(item)
return resources, preload_resources
async def get_collection_resources_with_no_pages(
self, crawls: List[CrawlOutWithResources]
) -> List[PreloadResource]:
"""Return wacz files in collection that have no pages"""
resources_no_pages: List[PreloadResource] = []
for crawl in crawls:
_, page_count = await self.page_ops.list_pages(crawl.id)
if page_count == 0 and crawl.resources:
for resource in crawl.resources:
resources_no_pages.append(
PreloadResource(
name=os.path.basename(resource.name),
crawlId=crawl.id,
hasPages=False,
)
)
return resources_no_pages
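
Crawls with no page records cannot be reached through page-based loading, so their WACZ files are surfaced as preloadResources for the replay client to load unconditionally. A hypothetical consumer-side helper (not part of this codebase) illustrating the intent:

```python
def waczs_to_always_load(replay_json: dict) -> list[str]:
    # Illustrative only: collect the WACZ filenames flagged for preloading
    # because their crawls have no page records to query against.
    return [res["name"] for res in replay_json.get("preloadResources", [])]
```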
async def get_collection_names(self, uuids: List[UUID]):
"""return object of {_id, names} given list of collection ids"""
@ -528,9 +580,15 @@ class CollectionOps:
names = [name for name in names if name]
return {"names": names}
async def get_collection_crawl_ids(self, coll_id: UUID) -> List[str]:
"""Return list of crawl ids in collection"""
async def get_collection_crawl_ids(
self, coll_id: UUID, public_or_unlisted_only=False
) -> List[str]:
"""Return list of crawl ids in collection, including only public collections"""
crawl_ids = []
# ensure collection is public or unlisted, else throw here
if public_or_unlisted_only:
await self.get_collection_raw(coll_id, public_or_unlisted_only)
async for crawl_raw in self.crawls.find(
{"collectionIds": coll_id}, projection=["_id"]
):
@ -1010,8 +1068,8 @@ def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_de
try:
all_collections, _ = await colls.list_collections(org, page_size=10_000)
for collection in all_collections:
results[collection.name] = await colls.get_collection_crawl_resources(
collection.id
results[collection.name], _ = (
await colls.get_collection_crawl_resources(collection.id)
)
except Exception as exc:
# pylint: disable=raise-missing-from
@ -1047,9 +1105,11 @@ def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_de
response_model=CollOut,
)
async def get_collection_replay(
coll_id: UUID, org: Organization = Depends(org_viewer_dep)
request: Request, coll_id: UUID, org: Organization = Depends(org_viewer_dep)
):
return await colls.get_collection_out(coll_id, org, resources=True)
return await colls.get_collection_out(
coll_id, org, resources=True, headers=dict(request.headers)
)
@app.get(
"/orgs/{oid}/collections/{coll_id}/public/replay.json",
@ -1057,12 +1117,17 @@ def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_de
response_model=CollOut,
)
async def get_collection_public_replay(
request: Request,
response: Response,
coll_id: UUID,
org: Organization = Depends(org_public),
):
coll = await colls.get_collection_out(
coll_id, org, resources=True, public_or_unlisted_only=True
coll_id,
org,
resources=True,
public_or_unlisted_only=True,
headers=dict(request.headers),
)
response.headers["Access-Control-Allow-Origin"] = "*"
response.headers["Access-Control-Allow-Headers"] = "*"

View File

@ -881,14 +881,6 @@ class CrawlOut(BaseMongoModel):
errorPageCount: Optional[int] = 0
# ============================================================================
class CrawlOutWithResources(CrawlOut):
"""Crawl output model including resources"""
resources: Optional[List[CrawlFileOut]] = []
collections: Optional[List[CollIdName]] = []
# ============================================================================
class UpdateCrawl(BaseModel):
"""Update crawl"""
@ -1222,6 +1214,173 @@ class ImageFilePreparer(FilePreparer):
)
# ============================================================================
### PAGES ###
# ============================================================================
class PageReviewUpdate(BaseModel):
"""Update model for page manual review/approval"""
approved: Optional[bool] = None
# ============================================================================
class PageNoteIn(BaseModel):
"""Input model for adding page notes"""
text: str
# ============================================================================
class PageNoteEdit(BaseModel):
"""Input model for editing page notes"""
id: UUID
text: str
# ============================================================================
class PageNoteDelete(BaseModel):
"""Delete model for page notes"""
delete_list: List[UUID] = []
# ============================================================================
class PageNote(BaseModel):
"""Model for page notes, tracking user and time"""
id: UUID
text: str
created: datetime
userid: UUID
userName: str
# ============================================================================
class PageQACompare(BaseModel):
"""Model for updating pages from QA run"""
screenshotMatch: Optional[float] = None
textMatch: Optional[float] = None
resourceCounts: Optional[Dict[str, int]] = None
# ============================================================================
class Page(BaseMongoModel):
"""Core page data, no QA"""
id: UUID
oid: UUID
crawl_id: str
# core page data
url: AnyHttpUrl
title: Optional[str] = None
ts: Optional[datetime] = None
loadState: Optional[int] = None
status: Optional[int] = None
mime: Optional[str] = None
filename: Optional[str] = None
depth: Optional[int] = None
favIconUrl: Optional[AnyHttpUrl] = None
isSeed: Optional[bool] = False
# manual review
userid: Optional[UUID] = None
modified: Optional[datetime] = None
approved: Optional[bool] = None
notes: List[PageNote] = []
isFile: Optional[bool] = False
isError: Optional[bool] = False
def compute_page_type(self):
"""sets self.isFile or self.isError flags"""
self.isFile = False
self.isError = False
if self.loadState == 2:
# pylint: disable=unsupported-membership-test
if self.mime and "html" not in self.mime:
self.isFile = True
elif self.title is None and self.status == 200:
self.isFile = True
elif self.loadState == 0:
self.isError = True
# ============================================================================
class PageWithAllQA(Page):
"""Model for core page data + qa"""
# automated heuristics, keyed by QA run id
qa: Optional[Dict[str, PageQACompare]] = {}
# ============================================================================
class PageOut(Page):
"""Model for pages output, no QA"""
status: int = 200
# ============================================================================
class PageOutWithSingleQA(Page):
"""Page out with single QA entry"""
qa: Optional[PageQACompare] = None
# ============================================================================
class PageNoteAddedResponse(BaseModel):
"""Model for response to adding page"""
added: bool
data: PageNote
# ============================================================================
class PageNoteUpdatedResponse(BaseModel):
"""Model for response to updating page"""
updated: bool
data: PageNote
# ============================================================================
class PageIdTimestamp(BaseModel):
"""Simplified model for page info to include in PageUrlCount"""
pageId: UUID
ts: Optional[datetime] = None
status: int = 200
# ============================================================================
class PageUrlCount(BaseModel):
"""Model for counting pages by URL"""
url: AnyHttpUrl
count: int = 0
snapshots: List[PageIdTimestamp] = []
# ============================================================================
class CrawlOutWithResources(CrawlOut):
"""Crawl output model including resources"""
resources: Optional[List[CrawlFileOut]] = []
collections: Optional[List[CollIdName]] = []
initialPages: List[PageOut] = []
totalPages: Optional[int] = None
pagesQueryUrl: str = ""
# ============================================================================
### COLLECTIONS ###
@ -1245,6 +1404,15 @@ class CollectionThumbnailSource(BaseModel):
urlPageId: UUID
# ============================================================================
class PreloadResource(BaseModel):
"""Resources that will preloaded in RWP"""
name: str
crawlId: str
hasPages: bool
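
A hypothetical PreloadResource instance, mirroring what get_collection_resources_with_no_pages builds (the filename and crawl id are made up):

```python
# Assumes the PreloadResource model defined above
preload = PreloadResource(
    name="manual-20250213-abc123.wacz",  # hypothetical WACZ basename
    crawlId="manual-20250213-abc123",    # hypothetical crawl id
    hasPages=False,
)
print(preload.model_dump())
```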
# ============================================================================
class Collection(BaseMongoModel):
"""Org collection structure"""
@ -1338,6 +1506,11 @@ class CollOut(BaseMongoModel):
allowPublicDownload: bool = True
initialPages: List[PageOut] = []
totalPages: Optional[int] = None
preloadResources: List[PreloadResource] = []
pagesQueryUrl: str = ""
# ============================================================================
class PublicCollOut(BaseMongoModel):
@ -2435,161 +2608,6 @@ AnyJob = RootModel[
]
# ============================================================================
### PAGES ###
# ============================================================================
class PageReviewUpdate(BaseModel):
"""Update model for page manual review/approval"""
approved: Optional[bool] = None
# ============================================================================
class PageNoteIn(BaseModel):
"""Input model for adding page notes"""
text: str
# ============================================================================
class PageNoteEdit(BaseModel):
"""Input model for editing page notes"""
id: UUID
text: str
# ============================================================================
class PageNoteDelete(BaseModel):
"""Delete model for page notes"""
delete_list: List[UUID] = []
# ============================================================================
class PageNote(BaseModel):
"""Model for page notes, tracking user and time"""
id: UUID
text: str
created: datetime
userid: UUID
userName: str
# ============================================================================
class PageQACompare(BaseModel):
"""Model for updating pages from QA run"""
screenshotMatch: Optional[float] = None
textMatch: Optional[float] = None
resourceCounts: Optional[Dict[str, int]] = None
# ============================================================================
class Page(BaseMongoModel):
"""Core page data, no QA"""
id: UUID
oid: UUID
crawl_id: str
# core page data
url: AnyHttpUrl
title: Optional[str] = None
ts: Optional[datetime] = None
loadState: Optional[int] = None
status: Optional[int] = None
mime: Optional[str] = None
filename: Optional[str] = None
depth: Optional[int] = None
favIconUrl: Optional[AnyHttpUrl] = None
isSeed: Optional[bool] = False
# manual review
userid: Optional[UUID] = None
modified: Optional[datetime] = None
approved: Optional[bool] = None
notes: List[PageNote] = []
isFile: Optional[bool] = False
isError: Optional[bool] = False
def compute_page_type(self):
"""sets self.isFile or self.isError flags"""
self.isFile = False
self.isError = False
if self.loadState == 2:
# pylint: disable=unsupported-membership-test
if self.mime and "html" not in self.mime:
self.isFile = True
elif self.title is None and self.status == 200:
self.isFile = True
elif self.loadState == 0:
self.isError = True
# ============================================================================
class PageWithAllQA(Page):
"""Model for core page data + qa"""
# automated heuristics, keyed by QA run id
qa: Optional[Dict[str, PageQACompare]] = {}
# ============================================================================
class PageOut(Page):
"""Model for pages output, no QA"""
status: int = 200
# ============================================================================
class PageOutWithSingleQA(Page):
"""Page out with single QA entry"""
qa: Optional[PageQACompare] = None
# ============================================================================
class PageNoteAddedResponse(BaseModel):
"""Model for response to adding page"""
added: bool
data: PageNote
# ============================================================================
class PageNoteUpdatedResponse(BaseModel):
"""Model for response to updating page"""
updated: bool
data: PageNote
# ============================================================================
class PageIdTimestamp(BaseModel):
"""Simplified model for page info to include in PageUrlCount"""
pageId: UUID
ts: Optional[datetime] = None
status: int = 200
# ============================================================================
class PageUrlCount(BaseModel):
"""Model for counting pages by URL"""
url: AnyHttpUrl
count: int = 0
snapshots: List[PageIdTimestamp] = []
# ============================================================================
### GENERIC RESPONSE MODELS ###

View File

@ -11,7 +11,7 @@ from datetime import datetime
from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
from uuid import UUID, uuid4
from fastapi import Depends, HTTPException, Request
from fastapi import Depends, HTTPException, Request, Response
import pymongo
from .models import (
@ -35,6 +35,7 @@ from .models import (
DeletedResponse,
PageNoteAddedResponse,
PageNoteUpdatedResponse,
EmptyResponse,
)
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
from .utils import str_to_date, str_list_to_bools, dt_now
@ -503,6 +504,7 @@ class PageOps:
self,
crawl_id: str,
org: Optional[Organization] = None,
search: Optional[str] = None,
url: Optional[str] = None,
url_prefix: Optional[str] = None,
ts: Optional[datetime] = None,
@ -534,6 +536,13 @@ class PageOps:
if org:
query["oid"] = org.id
if search:
search_regex = re.escape(urllib.parse.unquote(search))
query["$or"] = [
{"url": {"$regex": search_regex, "$options": "i"}},
{"title": {"$regex": search_regex, "$options": "i"}},
]
if url_prefix:
url_prefix = urllib.parse.unquote(url_prefix)
regex_pattern = f"^{re.escape(url_prefix)}"
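
The new search filter builds a case-insensitive regex matched against both url and title. Conceptually, a request with ?search=example adds a clause like this to the Mongo query (standalone sketch of the same logic):

```python
import re
import urllib.parse

search = urllib.parse.unquote("example")  # hypothetical query parameter value
search_regex = re.escape(search)

query_fragment = {
    "$or": [
        {"url": {"$regex": search_regex, "$options": "i"}},
        {"title": {"$regex": search_regex, "$options": "i"}},
    ]
}
```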
@ -661,6 +670,7 @@ class PageOps:
self,
coll_id: UUID,
org: Optional[Organization] = None,
search: Optional[str] = None,
url: Optional[str] = None,
url_prefix: Optional[str] = None,
ts: Optional[datetime] = None,
@ -670,6 +680,7 @@ class PageOps:
page: int = 1,
sort_by: Optional[str] = None,
sort_direction: Optional[int] = -1,
public_or_unlisted_only=False,
) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]:
"""List all pages in collection, with optional filtering"""
# pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
@ -677,7 +688,9 @@ class PageOps:
page = page - 1
skip = page_size * page
crawl_ids = await self.coll_ops.get_collection_crawl_ids(coll_id)
crawl_ids = await self.coll_ops.get_collection_crawl_ids(
coll_id, public_or_unlisted_only
)
query: dict[str, object] = {
"crawl_id": {"$in": crawl_ids},
@ -685,7 +698,14 @@ class PageOps:
if org:
query["oid"] = org.id
if url_prefix:
if search:
search_regex = re.escape(urllib.parse.unquote(search))
query["$or"] = [
{"url": {"$regex": search_regex, "$options": "i"}},
{"title": {"$regex": search_regex, "$options": "i"}},
]
elif url_prefix:
url_prefix = urllib.parse.unquote(url_prefix)
regex_pattern = f"^{re.escape(url_prefix)}"
query["url"] = {"$regex": regex_pattern, "$options": "i"}
@ -724,6 +744,9 @@ class PageOps:
raise HTTPException(status_code=400, detail="invalid_sort_direction")
aggregate.extend([{"$sort": {sort_by: sort_direction}}])
else:
# default sort: seeds first, then by timestamp
aggregate.extend([{"$sort": {"isSeed": -1, "ts": 1}}])
aggregate.extend(
[
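
With no explicit sortBy, pages come back with seed pages first and then in ascending capture-time order, which is what feeds initialPages. The same ordering applied to an in-memory list, for illustration only:

```python
pages = [
    {"url": "https://example.com/about", "isSeed": False, "ts": "2025-01-02T00:00:00Z"},
    {"url": "https://example.com/",      "isSeed": True,  "ts": "2025-01-03T00:00:00Z"},
    {"url": "https://example.com/blog",  "isSeed": False, "ts": "2025-01-01T00:00:00Z"},
]
# seeds first (isSeed descending), then by timestamp ascending
pages.sort(key=lambda p: (not p["isSeed"], p["ts"]))
```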
@ -886,6 +909,7 @@ def init_pages_api(
org_viewer_dep = org_ops.org_viewer_dep
org_crawl_dep = org_ops.org_crawl_dep
org_public = org_ops.org_public
@app.post(
"/orgs/{oid}/crawls/all/pages/reAdd",
@ -1056,6 +1080,7 @@ def init_pages_api(
async def get_crawl_pages_list(
crawl_id: str,
org: Organization = Depends(org_crawl_dep),
search: Optional[str] = None,
url: Optional[str] = None,
urlPrefix: Optional[str] = None,
ts: Optional[datetime] = None,
@ -1077,6 +1102,7 @@ def init_pages_api(
pages, total = await ops.list_pages(
crawl_id=crawl_id,
org=org,
search=search,
url=url,
url_prefix=urlPrefix,
ts=ts,
@ -1093,13 +1119,15 @@ def init_pages_api(
return paginated_format(pages, total, page, pageSize)
@app.get(
"/orgs/{oid}/collections/{coll_id}/pages",
"/orgs/{oid}/collections/{coll_id}/public/pages",
tags=["pages", "collections"],
response_model=PaginatedPageOutResponse,
)
async def get_collection_pages_list(
async def get_public_collection_pages_list(
coll_id: UUID,
org: Organization = Depends(org_viewer_dep),
response: Response,
org: Organization = Depends(org_public),
search: Optional[str] = None,
url: Optional[str] = None,
urlPrefix: Optional[str] = None,
ts: Optional[datetime] = None,
@ -1114,6 +1142,58 @@ def init_pages_api(
pages, total = await ops.list_collection_pages(
coll_id=coll_id,
org=org,
search=search,
url=url,
url_prefix=urlPrefix,
ts=ts,
is_seed=isSeed,
depth=depth,
page_size=pageSize,
page=page,
sort_by=sortBy,
sort_direction=sortDirection,
public_or_unlisted_only=True,
)
response.headers["Access-Control-Allow-Origin"] = "*"
response.headers["Access-Control-Allow-Headers"] = "*"
return paginated_format(pages, total, page, pageSize)
@app.options(
"/orgs/{oid}/collections/{coll_id}/public/pages",
tags=["pages", "collections"],
response_model=EmptyResponse,
)
async def get_replay_preflight(response: Response):
response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
response.headers["Access-Control-Allow-Origin"] = "*"
response.headers["Access-Control-Allow-Headers"] = "*"
return {}
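
Because the public pages endpoint and its preflight handler set permissive CORS headers, an embedded replay viewer on another origin can query it directly. The same endpoint exercised from Python (host and IDs are placeholders):

```python
import requests

API = "https://app.browsertrix.example"  # hypothetical deployment
org_id = "ORG-UUID"                      # placeholder
coll_id = "COLL-UUID"                    # placeholder

# No auth header needed for public or unlisted collections
r = requests.get(
    f"{API}/api/orgs/{org_id}/collections/{coll_id}/public/pages",
    params={"search": "example", "pageSize": 25},
)
r.raise_for_status()
data = r.json()
print(data["total"], [p["url"] for p in data["items"]])
```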
@app.get(
"/orgs/{oid}/collections/{coll_id}/pages",
tags=["pages", "collections"],
response_model=PaginatedPageOutResponse,
)
async def get_collection_pages_list(
coll_id: UUID,
org: Organization = Depends(org_viewer_dep),
search: Optional[str] = None,
url: Optional[str] = None,
urlPrefix: Optional[str] = None,
ts: Optional[datetime] = None,
isSeed: Optional[bool] = None,
depth: Optional[int] = None,
pageSize: int = DEFAULT_PAGE_SIZE,
page: int = 1,
sortBy: Optional[str] = None,
sortDirection: Optional[int] = -1,
):
"""Retrieve paginated list of pages in collection"""
pages, total = await ops.list_collection_pages(
coll_id=coll_id,
org=org,
search=search,
url=url,
url_prefix=urlPrefix,
ts=ts,

View File

@ -382,6 +382,11 @@ def test_get_collection_replay(crawler_auth_headers, default_org_id):
assert data["dateEarliest"]
assert data["dateLatest"]
assert data["defaultThumbnailName"]
assert data["initialPages"]
assert data["pagesQueryUrl"].endswith(
f"/orgs/{default_org_id}/collections/{_coll_id}/pages"
)
assert "preloadResources" in data
resources = data["resources"]
assert resources
@ -413,10 +418,27 @@ def test_collection_public(crawler_auth_headers, default_org_id):
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
headers=crawler_auth_headers,
)
data = r.json()
assert data["initialPages"]
assert data["pagesQueryUrl"].endswith(
f"/orgs/{default_org_id}/collections/{_coll_id}/public/pages"
)
assert "preloadResources" in data
assert r.status_code == 200
assert r.headers["Access-Control-Allow-Origin"] == "*"
assert r.headers["Access-Control-Allow-Headers"] == "*"
# test public pages endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] > 0
assert data["items"]
# make unlisted and test replay headers
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
@ -451,6 +473,12 @@ def test_collection_public(crawler_auth_headers, default_org_id):
)
assert r.status_code == 404
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
headers=crawler_auth_headers,
)
assert r.status_code == 404
def test_collection_access_invalid_value(crawler_auth_headers, default_org_id):
r = requests.patch(
@ -614,6 +642,39 @@ def test_list_pages_in_collection(crawler_auth_headers, default_org_id):
coll_page_id = coll_page["id"]
coll_page_url = coll_page["url"]
coll_page_ts = coll_page["ts"]
coll_page_title = coll_page["title"]
# Test search filter
partial_title = coll_page_title[:5]
partial_url = coll_page_url[:8]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_title}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
for matching_page in data["items"]:
assert (
partial_title in matching_page["title"]
or partial_url in matching_page["url"]
)
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_url}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
for matching_page in data["items"]:
assert (
partial_title in matching_page["title"]
or partial_url in matching_page["url"]
)
# Test exact url filter
r = requests.get(

View File

@ -184,6 +184,11 @@ def test_wait_for_complete(admin_auth_headers, default_org_id):
assert len(data["resources"]) == 1
assert data["resources"][0]["path"]
assert len(data["initialPages"]) == 1
assert data["pagesQueryUrl"].endswith(
f"/orgs/{default_org_id}/crawls/{admin_crawl_id}/pages"
)
# ensure filename matches specified pattern
# set in default_crawl_filename_template
assert re.search("/[\\d]+-testing-[\\w-]+\\.wacz", data["resources"][0]["path"])