Add initial pages + pagesQuery endpoint to /replay.json APIs (#2380)

Fixes #2360 

- Adds `initialPages` to the /replay.json response for collections, returning up to 25 pages (seed pages first, then sorted by capture time).
- Adds `pagesQueryUrl` to /replay.json.
- Adds a public pages search endpoint to support public collections.
- Adds `preloadResources` to /replay.json: a list of WACZ files that should always be loaded (see the example below).
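
A minimal sketch of how a client might consume the new fields (the host and IDs below are placeholders; the `search` and `page` query parameters come from the new pages endpoints):

import requests

API = "https://app.example.com/api"   # placeholder deployment URL
ORG_ID = "<org-uuid>"                 # placeholder
COLL_ID = "<collection-uuid>"         # placeholder

# /replay.json now includes initial pages, preload resources, and a pages query URL
data = requests.get(f"{API}/orgs/{ORG_ID}/collections/{COLL_ID}/public/replay.json").json()

print(len(data["initialPages"]))   # up to 25 pages, seed pages first
print(data["preloadResources"])    # WACZ files that should always be loaded

# further pages are fetched on demand from the advertised query endpoint
more = requests.get(data["pagesQueryUrl"], params={"search": "example", "page": 1}).json()
print(more["total"], len(more["items"]))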

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer, 2025-02-13 16:53:47 -08:00 (committed by GitHub)
parent 73f9f949af
commit 7b2932c582
6 changed files with 428 additions and 187 deletions

View File

@@ -3,6 +3,7 @@
 from datetime import datetime, timedelta
 from typing import Optional, List, Union, Dict, Any, Type, TYPE_CHECKING, cast, Tuple
 from uuid import UUID
+import os
 import urllib.parse
 import asyncio

@@ -31,7 +32,7 @@ from .models import (
     PRESIGN_DURATION_SECONDS,
 )
 from .pagination import paginated_format, DEFAULT_PAGE_SIZE
-from .utils import dt_now, date_to_str
+from .utils import dt_now, date_to_str, get_origin

 if TYPE_CHECKING:
     from .crawlconfigs import CrawlConfigOps

@@ -156,6 +157,7 @@ class BaseCrawlOps:
         org: Optional[Organization] = None,
         type_: Optional[str] = None,
         skip_resources=False,
+        headers: Optional[dict] = None,
     ) -> CrawlOutWithResources:
         """Get crawl data for api output"""
         res = await self.get_crawl_raw(crawlid, org, type_)

@@ -168,6 +170,16 @@
         if coll_ids:
             res["collections"] = await self.colls.get_collection_names(coll_ids)

+        res["initialPages"], _ = await self.page_ops.list_pages(
+            crawlid, is_seed=True, page_size=25
+        )
+
+        oid = res.get("oid")
+        if oid:
+            res["pagesQueryUrl"] = (
+                get_origin(headers) + f"/api/orgs/{oid}/crawls/{crawlid}/pages"
+            )
+
         crawl = CrawlOutWithResources.from_dict(res)

         if not skip_resources:

@@ -497,7 +509,7 @@
             out_files.append(
                 CrawlFileOut(
-                    name=file_.filename,
+                    name=os.path.basename(file_.filename),
                     path=presigned_url or "",
                     hash=file_.hash,
                     size=file_.size,
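
The `pagesQueryUrl` above is assembled from `get_origin(headers)`, a helper in `.utils` that is not part of this diff. For orientation only, a rough sketch of the kind of lookup such a helper could perform, assuming forwarded-proto/host headers with an environment fallback (names and behavior are illustrative, not the actual implementation):

import os
from typing import Optional

def origin_from_headers(headers: Optional[dict]) -> str:
    # Illustrative only: derive scheme://host from request headers, else fall back
    default = os.environ.get("APP_ORIGIN", "http://localhost:8000")  # assumed fallback
    if not headers:
        return default
    host = headers.get("host")
    scheme = headers.get("x-forwarded-proto", "https")
    return f"{scheme}://{host}" if host else default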

View File

@@ -53,16 +53,18 @@ from .models import (
     ImageFilePreparer,
     MIN_UPLOAD_PART_SIZE,
     PublicCollOut,
+    PreloadResource,
 )
-from .utils import dt_now, slug_from_name, get_duplicate_key_error_field
+from .utils import dt_now, slug_from_name, get_duplicate_key_error_field, get_origin

 if TYPE_CHECKING:
     from .orgs import OrgOps
     from .storages import StorageOps
     from .webhooks import EventWebhookOps
     from .crawls import CrawlOps
+    from .pages import PageOps
 else:
-    OrgOps = StorageOps = EventWebhookOps = CrawlOps = object
+    OrgOps = StorageOps = EventWebhookOps = CrawlOps = PageOps = object


 THUMBNAIL_MAX_SIZE = 2_000_000

@@ -78,6 +80,7 @@ class CollectionOps:
     storage_ops: StorageOps
     event_webhook_ops: EventWebhookOps
     crawl_ops: CrawlOps
+    page_ops: PageOps

     def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):
         self.collections = mdb["collections"]

@@ -337,12 +340,28 @@
         org: Organization,
         resources=False,
         public_or_unlisted_only=False,
+        headers: Optional[dict] = None,
     ) -> CollOut:
         """Get CollOut by id"""
+        # pylint: disable=too-many-locals
         result = await self.get_collection_raw(coll_id, public_or_unlisted_only)

         if resources:
-            result["resources"] = await self.get_collection_crawl_resources(coll_id)
+            result["resources"], result["preloadResources"] = (
+                await self.get_collection_crawl_resources(
+                    coll_id, include_preloads=True
+                )
+            )
+
+            result["initialPages"], result["totalPages"] = (
+                await self.page_ops.list_collection_pages(coll_id, page_size=25)
+            )
+
+            public = "public/" if public_or_unlisted_only else ""
+            result["pagesQueryUrl"] = (
+                get_origin(headers)
+                + f"/api/orgs/{org.id}/collections/{coll_id}/{public}pages"
+            )

         thumbnail = result.get("thumbnail")
         if thumbnail:

@@ -369,7 +388,7 @@
         if result.get("access") not in allowed_access:
             raise HTTPException(status_code=404, detail="collection_not_found")

-        result["resources"] = await self.get_collection_crawl_resources(coll_id)
+        result["resources"], _ = await self.get_collection_crawl_resources(coll_id)

         thumbnail = result.get("thumbnail")
         if thumbnail:

@@ -468,7 +487,11 @@
         collections: List[Union[CollOut, PublicCollOut]] = []

         for res in items:
-            res["resources"] = await self.get_collection_crawl_resources(res["_id"])
+            res["resources"], res["preloadResources"] = (
+                await self.get_collection_crawl_resources(
+                    res["_id"], include_preloads=not public_colls_out
+                )
+            )

             thumbnail = res.get("thumbnail")
             if thumbnail:

@@ -490,12 +513,14 @@
         return collections, total

-    async def get_collection_crawl_resources(self, coll_id: UUID):
+    async def get_collection_crawl_resources(
+        self, coll_id: UUID, include_preloads=False
+    ):
         """Return pre-signed resources for all collection crawl files."""
         # Ensure collection exists
         _ = await self.get_collection_raw(coll_id)

-        all_files = []
+        resources = []

         crawls, _ = await self.crawl_ops.list_all_base_crawls(
             collection_id=coll_id,

@@ -506,9 +531,36 @@
         for crawl in crawls:
             if crawl.resources:
-                all_files.extend(crawl.resources)
+                resources.extend(crawl.resources)

-        return all_files
+        preload_resources: List[PreloadResource] = []
+        if include_preloads:
+            no_page_items = await self.get_collection_resources_with_no_pages(crawls)
+            for item in no_page_items:
+                preload_resources.append(item)
+
+        return resources, preload_resources
+
+    async def get_collection_resources_with_no_pages(
+        self, crawls: List[CrawlOutWithResources]
+    ) -> List[PreloadResource]:
+        """Return wacz files in collection that have no pages"""
+        resources_no_pages: List[PreloadResource] = []
+
+        for crawl in crawls:
+            _, page_count = await self.page_ops.list_pages(crawl.id)
+            if page_count == 0 and crawl.resources:
+                for resource in crawl.resources:
+                    resources_no_pages.append(
+                        PreloadResource(
+                            name=os.path.basename(resource.name),
+                            crawlId=crawl.id,
+                            hasPages=False,
+                        )
+                    )
+
+        return resources_no_pages

     async def get_collection_names(self, uuids: List[UUID]):
         """return object of {_id, names} given list of collection ids"""

@@ -528,9 +580,15 @@
         names = [name for name in names if name]
         return {"names": names}

-    async def get_collection_crawl_ids(self, coll_id: UUID) -> List[str]:
-        """Return list of crawl ids in collection"""
+    async def get_collection_crawl_ids(
+        self, coll_id: UUID, public_or_unlisted_only=False
+    ) -> List[str]:
+        """Return list of crawl ids in collection, including only public collections"""
         crawl_ids = []
+        # ensure collection is public or unlisted, else throw here
+        if public_or_unlisted_only:
+            await self.get_collection_raw(coll_id, public_or_unlisted_only)
         async for crawl_raw in self.crawls.find(
             {"collectionIds": coll_id}, projection=["_id"]
         ):

@@ -1010,8 +1068,8 @@ def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_de
         try:
             all_collections, _ = await colls.list_collections(org, page_size=10_000)
             for collection in all_collections:
-                results[collection.name] = await colls.get_collection_crawl_resources(
-                    collection.id
+                results[collection.name], _ = (
+                    await colls.get_collection_crawl_resources(collection.id)
                 )
         except Exception as exc:
             # pylint: disable=raise-missing-from

@@ -1047,9 +1105,11 @@
         response_model=CollOut,
     )
     async def get_collection_replay(
-        coll_id: UUID, org: Organization = Depends(org_viewer_dep)
+        request: Request, coll_id: UUID, org: Organization = Depends(org_viewer_dep)
     ):
-        return await colls.get_collection_out(coll_id, org, resources=True)
+        return await colls.get_collection_out(
+            coll_id, org, resources=True, headers=dict(request.headers)
+        )

     @app.get(
         "/orgs/{oid}/collections/{coll_id}/public/replay.json",

@@ -1057,12 +1117,17 @@
         response_model=CollOut,
     )
     async def get_collection_public_replay(
+        request: Request,
         response: Response,
         coll_id: UUID,
         org: Organization = Depends(org_public),
     ):
         coll = await colls.get_collection_out(
-            coll_id, org, resources=True, public_or_unlisted_only=True
+            coll_id,
+            org,
+            resources=True,
+            public_or_unlisted_only=True,
+            headers=dict(request.headers),
         )
         response.headers["Access-Control-Allow-Origin"] = "*"
         response.headers["Access-Control-Allow-Headers"] = "*"
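
get_collection_resources_with_no_pages flags WACZ files that belong to crawls with zero page records, since those files would never be surfaced by a page query and so must always be loaded by the replay client. A sketch of how a viewer might combine the new fields when deciding what to load up front (the loading strategy here is an assumption, not RWP's documented behavior):

def select_initial_waczs(replay_json: dict) -> set:
    """Illustrative: pick WACZ names to load before any page is requested."""
    # files from crawls with no pages must always be loaded
    always = {res["name"] for res in replay_json.get("preloadResources", [])}
    # files referenced by the first batch of pages can be loaded eagerly too
    eager = {
        page["filename"]
        for page in replay_json.get("initialPages", [])
        if page.get("filename")
    }
    return always | eager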

View File

@@ -881,14 +881,6 @@ class CrawlOut(BaseMongoModel):
     errorPageCount: Optional[int] = 0


-# ============================================================================
-class CrawlOutWithResources(CrawlOut):
-    """Crawl output model including resources"""
-
-    resources: Optional[List[CrawlFileOut]] = []
-    collections: Optional[List[CollIdName]] = []
-
-
 # ============================================================================
 class UpdateCrawl(BaseModel):
     """Update crawl"""

@@ -1222,6 +1214,173 @@ class ImageFilePreparer(FilePreparer):
         )


+# ============================================================================
+### PAGES ###
+# ============================================================================
+class PageReviewUpdate(BaseModel):
+    """Update model for page manual review/approval"""
+
+    approved: Optional[bool] = None
+
+
+# ============================================================================
+class PageNoteIn(BaseModel):
+    """Input model for adding page notes"""
+
+    text: str
+
+
+# ============================================================================
+class PageNoteEdit(BaseModel):
+    """Input model for editing page notes"""
+
+    id: UUID
+    text: str
+
+
+# ============================================================================
+class PageNoteDelete(BaseModel):
+    """Delete model for page notes"""
+
+    delete_list: List[UUID] = []
+
+
+# ============================================================================
+class PageNote(BaseModel):
+    """Model for page notes, tracking user and time"""
+
+    id: UUID
+    text: str
+    created: datetime
+    userid: UUID
+    userName: str
+
+
+# ============================================================================
+class PageQACompare(BaseModel):
+    """Model for updating pages from QA run"""
+
+    screenshotMatch: Optional[float] = None
+    textMatch: Optional[float] = None
+    resourceCounts: Optional[Dict[str, int]] = None
+
+
+# ============================================================================
+class Page(BaseMongoModel):
+    """Core page data, no QA"""
+
+    id: UUID
+    oid: UUID
+    crawl_id: str
+
+    # core page data
+    url: AnyHttpUrl
+    title: Optional[str] = None
+    ts: Optional[datetime] = None
+    loadState: Optional[int] = None
+    status: Optional[int] = None
+    mime: Optional[str] = None
+    filename: Optional[str] = None
+    depth: Optional[int] = None
+    favIconUrl: Optional[AnyHttpUrl] = None
+    isSeed: Optional[bool] = False
+
+    # manual review
+    userid: Optional[UUID] = None
+    modified: Optional[datetime] = None
+    approved: Optional[bool] = None
+    notes: List[PageNote] = []
+
+    isFile: Optional[bool] = False
+    isError: Optional[bool] = False
+
+    def compute_page_type(self):
+        """sets self.isFile or self.isError flags"""
+        self.isFile = False
+        self.isError = False
+        if self.loadState == 2:
+            # pylint: disable=unsupported-membership-test
+            if self.mime and "html" not in self.mime:
+                self.isFile = True
+            elif self.title is None and self.status == 200:
+                self.isFile = True
+        elif self.loadState == 0:
+            self.isError = True
+
+
+# ============================================================================
+class PageWithAllQA(Page):
+    """Model for core page data + qa"""
+
+    # automated heuristics, keyed by QA run id
+    qa: Optional[Dict[str, PageQACompare]] = {}
+
+
+# ============================================================================
+class PageOut(Page):
+    """Model for pages output, no QA"""
+
+    status: int = 200
+
+
+# ============================================================================
+class PageOutWithSingleQA(Page):
+    """Page out with single QA entry"""
+
+    qa: Optional[PageQACompare] = None
+
+
+# ============================================================================
+class PageNoteAddedResponse(BaseModel):
+    """Model for response to adding page"""
+
+    added: bool
+    data: PageNote
+
+
+# ============================================================================
+class PageNoteUpdatedResponse(BaseModel):
+    """Model for response to updating page"""
+
+    updated: bool
+    data: PageNote
+
+
+# ============================================================================
+class PageIdTimestamp(BaseModel):
+    """Simplified model for page info to include in PageUrlCount"""
+
+    pageId: UUID
+    ts: Optional[datetime] = None
+    status: int = 200
+
+
+# ============================================================================
+class PageUrlCount(BaseModel):
+    """Model for counting pages by URL"""
+
+    url: AnyHttpUrl
+    count: int = 0
+    snapshots: List[PageIdTimestamp] = []
+
+
+# ============================================================================
+class CrawlOutWithResources(CrawlOut):
+    """Crawl output model including resources"""
+
+    resources: Optional[List[CrawlFileOut]] = []
+    collections: Optional[List[CollIdName]] = []
+
+    initialPages: List[PageOut] = []
+    totalPages: Optional[int] = None
+    pagesQueryUrl: str = ""
+
+
 # ============================================================================
 ### COLLECTIONS ###

@@ -1245,6 +1404,15 @@ class CollectionThumbnailSource(BaseModel):
     urlPageId: UUID


+# ============================================================================
+class PreloadResource(BaseModel):
+    """Resources that will preloaded in RWP"""
+
+    name: str
+    crawlId: str
+    hasPages: bool
+
+
 # ============================================================================
 class Collection(BaseMongoModel):
     """Org collection structure"""

@@ -1338,6 +1506,11 @@ class CollOut(BaseMongoModel):
     allowPublicDownload: bool = True

+    initialPages: List[PageOut] = []
+    totalPages: Optional[int] = None
+    preloadResources: List[PreloadResource] = []
+    pagesQueryUrl: str = ""
+

 # ============================================================================
 class PublicCollOut(BaseMongoModel):

@@ -2435,161 +2608,6 @@ AnyJob = RootModel[
 ]


-# ============================================================================
-### PAGES ###
-# ============================================================================
-class PageReviewUpdate(BaseModel):
-    """Update model for page manual review/approval"""
-
-    approved: Optional[bool] = None
-
-
-# ============================================================================
-class PageNoteIn(BaseModel):
-    """Input model for adding page notes"""
-
-    text: str
-
-
-# ============================================================================
-class PageNoteEdit(BaseModel):
-    """Input model for editing page notes"""
-
-    id: UUID
-    text: str
-
-
-# ============================================================================
-class PageNoteDelete(BaseModel):
-    """Delete model for page notes"""
-
-    delete_list: List[UUID] = []
-
-
-# ============================================================================
-class PageNote(BaseModel):
-    """Model for page notes, tracking user and time"""
-
-    id: UUID
-    text: str
-    created: datetime
-    userid: UUID
-    userName: str
-
-
-# ============================================================================
-class PageQACompare(BaseModel):
-    """Model for updating pages from QA run"""
-
-    screenshotMatch: Optional[float] = None
-    textMatch: Optional[float] = None
-    resourceCounts: Optional[Dict[str, int]] = None
-
-
-# ============================================================================
-class Page(BaseMongoModel):
-    """Core page data, no QA"""
-
-    id: UUID
-    oid: UUID
-    crawl_id: str
-
-    # core page data
-    url: AnyHttpUrl
-    title: Optional[str] = None
-    ts: Optional[datetime] = None
-    loadState: Optional[int] = None
-    status: Optional[int] = None
-    mime: Optional[str] = None
-    filename: Optional[str] = None
-    depth: Optional[int] = None
-    favIconUrl: Optional[AnyHttpUrl] = None
-    isSeed: Optional[bool] = False
-
-    # manual review
-    userid: Optional[UUID] = None
-    modified: Optional[datetime] = None
-    approved: Optional[bool] = None
-    notes: List[PageNote] = []
-
-    isFile: Optional[bool] = False
-    isError: Optional[bool] = False
-
-    def compute_page_type(self):
-        """sets self.isFile or self.isError flags"""
-        self.isFile = False
-        self.isError = False
-        if self.loadState == 2:
-            # pylint: disable=unsupported-membership-test
-            if self.mime and "html" not in self.mime:
-                self.isFile = True
-            elif self.title is None and self.status == 200:
-                self.isFile = True
-        elif self.loadState == 0:
-            self.isError = True
-
-
-# ============================================================================
-class PageWithAllQA(Page):
-    """Model for core page data + qa"""
-
-    # automated heuristics, keyed by QA run id
-    qa: Optional[Dict[str, PageQACompare]] = {}
-
-
-# ============================================================================
-class PageOut(Page):
-    """Model for pages output, no QA"""
-
-    status: int = 200
-
-
-# ============================================================================
-class PageOutWithSingleQA(Page):
-    """Page out with single QA entry"""
-
-    qa: Optional[PageQACompare] = None
-
-
-# ============================================================================
-class PageNoteAddedResponse(BaseModel):
-    """Model for response to adding page"""
-
-    added: bool
-    data: PageNote
-
-
-# ============================================================================
-class PageNoteUpdatedResponse(BaseModel):
-    """Model for response to updating page"""
-
-    updated: bool
-    data: PageNote
-
-
-# ============================================================================
-class PageIdTimestamp(BaseModel):
-    """Simplified model for page info to include in PageUrlCount"""
-
-    pageId: UUID
-    ts: Optional[datetime] = None
-    status: int = 200
-
-
-# ============================================================================
-class PageUrlCount(BaseModel):
-    """Model for counting pages by URL"""
-
-    url: AnyHttpUrl
-    count: int = 0
-    snapshots: List[PageIdTimestamp] = []
-
-
 # ============================================================================
 ### GENERIC RESPONSE MODELS ###
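
For reference, the added `PreloadResource` entries serialize to plain JSON objects. A standalone sketch assuming pydantic v2 (this mirrors the fields above rather than importing the package; the values are made up):

from pydantic import BaseModel

class PreloadResource(BaseModel):
    # standalone mirror of the model above, for illustration only
    name: str
    crawlId: str
    hasPages: bool

item = PreloadResource(name="crawl-part-1.wacz", crawlId="manual-20250213", hasPages=False)
print(item.model_dump())
# {'name': 'crawl-part-1.wacz', 'crawlId': 'manual-20250213', 'hasPages': False}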

View File

@@ -11,7 +11,7 @@ from datetime import datetime
 from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
 from uuid import UUID, uuid4

-from fastapi import Depends, HTTPException, Request
+from fastapi import Depends, HTTPException, Request, Response
 import pymongo

 from .models import (

@@ -35,6 +35,7 @@ from .models import (
     DeletedResponse,
     PageNoteAddedResponse,
     PageNoteUpdatedResponse,
+    EmptyResponse,
 )
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
 from .utils import str_to_date, str_list_to_bools, dt_now

@@ -503,6 +504,7 @@ class PageOps:
         self,
         crawl_id: str,
         org: Optional[Organization] = None,
+        search: Optional[str] = None,
         url: Optional[str] = None,
         url_prefix: Optional[str] = None,
         ts: Optional[datetime] = None,

@@ -534,6 +536,13 @@
         if org:
             query["oid"] = org.id

+        if search:
+            search_regex = re.escape(urllib.parse.unquote(search))
+            query["$or"] = [
+                {"url": {"$regex": search_regex, "$options": "i"}},
+                {"title": {"$regex": search_regex, "$options": "i"}},
+            ]
+
         if url_prefix:
             url_prefix = urllib.parse.unquote(url_prefix)
             regex_pattern = f"^{re.escape(url_prefix)}"

@@ -661,6 +670,7 @@
         self,
         coll_id: UUID,
         org: Optional[Organization] = None,
+        search: Optional[str] = None,
         url: Optional[str] = None,
         url_prefix: Optional[str] = None,
         ts: Optional[datetime] = None,

@@ -670,6 +680,7 @@
         page: int = 1,
         sort_by: Optional[str] = None,
         sort_direction: Optional[int] = -1,
+        public_or_unlisted_only=False,
     ) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]:
         """List all pages in collection, with optional filtering"""
         # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements

@@ -677,7 +688,9 @@
         page = page - 1
         skip = page_size * page

-        crawl_ids = await self.coll_ops.get_collection_crawl_ids(coll_id)
+        crawl_ids = await self.coll_ops.get_collection_crawl_ids(
+            coll_id, public_or_unlisted_only
+        )

         query: dict[str, object] = {
             "crawl_id": {"$in": crawl_ids},

@@ -685,7 +698,14 @@
         if org:
             query["oid"] = org.id

-        if url_prefix:
+        if search:
+            search_regex = re.escape(urllib.parse.unquote(search))
+            query["$or"] = [
+                {"url": {"$regex": search_regex, "$options": "i"}},
+                {"title": {"$regex": search_regex, "$options": "i"}},
+            ]
+        elif url_prefix:
             url_prefix = urllib.parse.unquote(url_prefix)
             regex_pattern = f"^{re.escape(url_prefix)}"
             query["url"] = {"$regex": regex_pattern, "$options": "i"}

@@ -724,6 +744,9 @@
                 raise HTTPException(status_code=400, detail="invalid_sort_direction")

             aggregate.extend([{"$sort": {sort_by: sort_direction}}])
+        else:
+            # default sort: seeds first, then by timestamp
+            aggregate.extend([{"$sort": {"isSeed": -1, "ts": 1}}])

         aggregate.extend(
             [

@@ -886,6 +909,7 @@
     org_viewer_dep = org_ops.org_viewer_dep
     org_crawl_dep = org_ops.org_crawl_dep
+    org_public = org_ops.org_public

     @app.post(
         "/orgs/{oid}/crawls/all/pages/reAdd",

@@ -1056,6 +1080,7 @@
     async def get_crawl_pages_list(
         crawl_id: str,
         org: Organization = Depends(org_crawl_dep),
+        search: Optional[str] = None,
         url: Optional[str] = None,
         urlPrefix: Optional[str] = None,
         ts: Optional[datetime] = None,

@@ -1077,6 +1102,7 @@
         pages, total = await ops.list_pages(
             crawl_id=crawl_id,
             org=org,
+            search=search,
             url=url,
             url_prefix=urlPrefix,
             ts=ts,

@@ -1093,13 +1119,15 @@
         return paginated_format(pages, total, page, pageSize)

     @app.get(
-        "/orgs/{oid}/collections/{coll_id}/pages",
+        "/orgs/{oid}/collections/{coll_id}/public/pages",
         tags=["pages", "collections"],
         response_model=PaginatedPageOutResponse,
     )
-    async def get_collection_pages_list(
+    async def get_public_collection_pages_list(
         coll_id: UUID,
-        org: Organization = Depends(org_viewer_dep),
+        response: Response,
+        org: Organization = Depends(org_public),
+        search: Optional[str] = None,
         url: Optional[str] = None,
         urlPrefix: Optional[str] = None,
         ts: Optional[datetime] = None,

@@ -1114,6 +1142,58 @@
         pages, total = await ops.list_collection_pages(
             coll_id=coll_id,
             org=org,
+            search=search,
+            url=url,
+            url_prefix=urlPrefix,
+            ts=ts,
+            is_seed=isSeed,
+            depth=depth,
+            page_size=pageSize,
+            page=page,
+            sort_by=sortBy,
+            sort_direction=sortDirection,
+            public_or_unlisted_only=True,
+        )
+
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+
+        return paginated_format(pages, total, page, pageSize)
+
+    @app.options(
+        "/orgs/{oid}/collections/{coll_id}/public/pages",
+        tags=["pages", "collections"],
+        response_model=EmptyResponse,
+    )
+    async def get_replay_preflight(response: Response):
+        response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return {}
+
+    @app.get(
+        "/orgs/{oid}/collections/{coll_id}/pages",
+        tags=["pages", "collections"],
+        response_model=PaginatedPageOutResponse,
+    )
+    async def get_collection_pages_list(
+        coll_id: UUID,
+        org: Organization = Depends(org_viewer_dep),
+        search: Optional[str] = None,
+        url: Optional[str] = None,
+        urlPrefix: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        isSeed: Optional[bool] = None,
+        depth: Optional[int] = None,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+        sortBy: Optional[str] = None,
+        sortDirection: Optional[int] = -1,
+    ):
+        """Retrieve paginated list of pages in collection"""
+        pages, total = await ops.list_collection_pages(
+            coll_id=coll_id,
+            org=org,
+            search=search,
             url=url,
             url_prefix=urlPrefix,
             ts=ts,
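
The new `search` filter builds a case-insensitive regex over both `url` and `title` (in the collection pages query it takes precedence over `urlPrefix`). A small standalone sketch of the equivalent matching semantics, useful for reasoning about what `?search=` returns; this is plain Python, not the MongoDB query itself:

import re
import urllib.parse

def matches_search(page: dict, search: str) -> bool:
    # case-insensitive substring match over url and title, like the $regex query above
    pattern = re.compile(re.escape(urllib.parse.unquote(search)), re.IGNORECASE)
    return bool(
        pattern.search(page.get("url") or "") or pattern.search(page.get("title") or "")
    )

assert matches_search({"url": "https://example.com/", "title": "Example Domain"}, "EXAM")
assert not matches_search({"url": "https://example.com/", "title": None}, "webrecorder")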

View File

@@ -382,6 +382,11 @@ def test_get_collection_replay(crawler_auth_headers, default_org_id):
     assert data["dateEarliest"]
     assert data["dateLatest"]
     assert data["defaultThumbnailName"]
+    assert data["initialPages"]
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/collections/{_coll_id}/pages"
+    )
+    assert "preloadResources" in data

     resources = data["resources"]
     assert resources

@@ -413,10 +418,27 @@ def test_collection_public(crawler_auth_headers, default_org_id):
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
         headers=crawler_auth_headers,
     )
+    data = r.json()
+    assert data["initialPages"]
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/collections/{_coll_id}/public/pages"
+    )
+    assert "preloadResources" in data
+
     assert r.status_code == 200
     assert r.headers["Access-Control-Allow-Origin"] == "*"
     assert r.headers["Access-Control-Allow-Headers"] == "*"

+    # test public pages endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] > 0
+    assert data["items"]
+
     # make unlisted and test replay headers
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",

@@ -451,6 +473,12 @@
     )
     assert r.status_code == 404

+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 404
+

 def test_collection_access_invalid_value(crawler_auth_headers, default_org_id):
     r = requests.patch(

@@ -614,6 +642,39 @@ def test_list_pages_in_collection(crawler_auth_headers, default_org_id):
     coll_page_id = coll_page["id"]
     coll_page_url = coll_page["url"]
     coll_page_ts = coll_page["ts"]
+    coll_page_title = coll_page["title"]
+
+    # Test search filter
+    partial_title = coll_page_title[:5]
+    partial_url = coll_page_url[:8]
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_title}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert (
+            partial_title in matching_page["title"]
+            or partial_url in matching_page["url"]
+        )
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_url}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert (
+            partial_title in matching_page["title"]
+            or partial_url in matching_page["url"]
+        )
+
     # Test exact url filter
     r = requests.get(
View File

@@ -184,6 +184,11 @@ def test_wait_for_complete(admin_auth_headers, default_org_id):
     assert len(data["resources"]) == 1
     assert data["resources"][0]["path"]

+    assert len(data["initialPages"]) == 1
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/crawls/{admin_crawl_id}/pages"
+    )
+
     # ensure filename matches specified pattern
     # set in default_crawl_filename_template
     assert re.search("/[\\d]+-testing-[\\w-]+\\.wacz", data["resources"][0]["path"])