Add initial pages + pagesQuery endpoint to /replay.json APIs (#2380)
Fixes #2360

- Adds `initialPages` to the /replay.json response for collections, returning up to 25 pages (seed pages first, then sorted by capture time).
- Adds `pagesQueryUrl` to /replay.json.
- Adds a public pages search endpoint to support public collections.
- Adds `preloadResources` to /replay.json, including a list of WACZ files that should always be loaded.

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>

parent 73f9f949af, commit 7b2932c582
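As a quick illustration of how a client might consume the new fields (a minimal sketch; the base URL, IDs, and token are placeholders, while the endpoint paths and field names follow the changes in this PR):

```python
import requests

# Placeholder values for illustration only; substitute a real deployment,
# org id, collection id, and auth token.
API_BASE = "https://app.example.com/api"
ORG_ID = "<org-uuid>"
COLL_ID = "<coll-uuid>"
AUTH = {"Authorization": "Bearer <token>"}

resp = requests.get(
    f"{API_BASE}/orgs/{ORG_ID}/collections/{COLL_ID}/replay.json",
    headers=AUTH,
)
data = resp.json()

# Up to 25 pages (seed pages first) for immediate display in the replay UI
for page in data["initialPages"]:
    print(page["url"], page.get("ts"))

# Absolute URL of the filterable page search endpoint for this collection
print(data["pagesQueryUrl"])

# WACZ files that should always be loaded (their crawls have no indexed pages)
for res in data["preloadResources"]:
    print(res["crawlId"], res["name"])
```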
@@ -3,6 +3,7 @@
 from datetime import datetime, timedelta
 from typing import Optional, List, Union, Dict, Any, Type, TYPE_CHECKING, cast, Tuple
 from uuid import UUID
+import os
 import urllib.parse
 
 import asyncio
@@ -31,7 +32,7 @@ from .models import (
     PRESIGN_DURATION_SECONDS,
 )
 from .pagination import paginated_format, DEFAULT_PAGE_SIZE
-from .utils import dt_now, date_to_str
+from .utils import dt_now, date_to_str, get_origin
 
 if TYPE_CHECKING:
     from .crawlconfigs import CrawlConfigOps
@@ -156,6 +157,7 @@ class BaseCrawlOps:
         org: Optional[Organization] = None,
         type_: Optional[str] = None,
         skip_resources=False,
+        headers: Optional[dict] = None,
     ) -> CrawlOutWithResources:
         """Get crawl data for api output"""
         res = await self.get_crawl_raw(crawlid, org, type_)
@@ -168,6 +170,16 @@
             if coll_ids:
                 res["collections"] = await self.colls.get_collection_names(coll_ids)
 
+            res["initialPages"], _ = await self.page_ops.list_pages(
+                crawlid, is_seed=True, page_size=25
+            )
+
+            oid = res.get("oid")
+            if oid:
+                res["pagesQueryUrl"] = (
+                    get_origin(headers) + f"/api/orgs/{oid}/crawls/{crawlid}/pages"
+                )
+
         crawl = CrawlOutWithResources.from_dict(res)
 
         if not skip_resources:
@@ -497,7 +509,7 @@
 
             out_files.append(
                 CrawlFileOut(
-                    name=file_.filename,
+                    name=os.path.basename(file_.filename),
                     path=presigned_url or "",
                     hash=file_.hash,
                     size=file_.size,
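`get_origin` is imported from `.utils` above but its body is not part of this diff; it evidently turns the incoming request headers into an absolute origin so that `pagesQueryUrl` can be a full URL. A hedged sketch of that idea follows; the header names and environment fallback are assumptions, not the actual helper:

```python
import os
from typing import Optional


def get_origin(headers: Optional[dict]) -> str:
    """Sketch: derive the request origin (scheme://host) from proxy headers.

    The host/x-forwarded-proto lookups and the APP_ORIGIN fallback are
    assumptions; the real helper lives in the backend's utils module.
    """
    default_origin = os.environ.get("APP_ORIGIN", "")
    if not headers:
        return default_origin

    host = headers.get("host")
    if not host:
        return default_origin

    scheme = headers.get("x-forwarded-proto", "https")
    return f"{scheme}://{host}"
```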
@@ -53,16 +53,18 @@ from .models import (
     ImageFilePreparer,
     MIN_UPLOAD_PART_SIZE,
     PublicCollOut,
+    PreloadResource,
 )
-from .utils import dt_now, slug_from_name, get_duplicate_key_error_field
+from .utils import dt_now, slug_from_name, get_duplicate_key_error_field, get_origin
 
 if TYPE_CHECKING:
     from .orgs import OrgOps
     from .storages import StorageOps
     from .webhooks import EventWebhookOps
     from .crawls import CrawlOps
+    from .pages import PageOps
 else:
-    OrgOps = StorageOps = EventWebhookOps = CrawlOps = object
+    OrgOps = StorageOps = EventWebhookOps = CrawlOps = PageOps = object
 
 
 THUMBNAIL_MAX_SIZE = 2_000_000
@@ -78,6 +80,7 @@ class CollectionOps:
     storage_ops: StorageOps
     event_webhook_ops: EventWebhookOps
     crawl_ops: CrawlOps
+    page_ops: PageOps
 
     def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):
         self.collections = mdb["collections"]
@@ -337,12 +340,28 @@
         org: Organization,
         resources=False,
         public_or_unlisted_only=False,
+        headers: Optional[dict] = None,
     ) -> CollOut:
         """Get CollOut by id"""
+        # pylint: disable=too-many-locals
         result = await self.get_collection_raw(coll_id, public_or_unlisted_only)
 
         if resources:
-            result["resources"] = await self.get_collection_crawl_resources(coll_id)
+            result["resources"], result["preloadResources"] = (
+                await self.get_collection_crawl_resources(
+                    coll_id, include_preloads=True
+                )
+            )
+
+            result["initialPages"], result["totalPages"] = (
+                await self.page_ops.list_collection_pages(coll_id, page_size=25)
+            )
+
+            public = "public/" if public_or_unlisted_only else ""
+            result["pagesQueryUrl"] = (
+                get_origin(headers)
+                + f"/api/orgs/{org.id}/collections/{coll_id}/{public}pages"
+            )
 
         thumbnail = result.get("thumbnail")
         if thumbnail:
@@ -369,7 +388,7 @@
         if result.get("access") not in allowed_access:
             raise HTTPException(status_code=404, detail="collection_not_found")
 
-        result["resources"] = await self.get_collection_crawl_resources(coll_id)
+        result["resources"], _ = await self.get_collection_crawl_resources(coll_id)
 
         thumbnail = result.get("thumbnail")
         if thumbnail:
@@ -468,7 +487,11 @@
         collections: List[Union[CollOut, PublicCollOut]] = []
 
         for res in items:
-            res["resources"] = await self.get_collection_crawl_resources(res["_id"])
+            res["resources"], res["preloadResources"] = (
+                await self.get_collection_crawl_resources(
+                    res["_id"], include_preloads=not public_colls_out
+                )
+            )
 
             thumbnail = res.get("thumbnail")
             if thumbnail:
@@ -490,12 +513,14 @@
 
         return collections, total
 
-    async def get_collection_crawl_resources(self, coll_id: UUID):
+    async def get_collection_crawl_resources(
+        self, coll_id: UUID, include_preloads=False
+    ):
         """Return pre-signed resources for all collection crawl files."""
         # Ensure collection exists
         _ = await self.get_collection_raw(coll_id)
 
-        all_files = []
+        resources = []
 
         crawls, _ = await self.crawl_ops.list_all_base_crawls(
             collection_id=coll_id,
@@ -506,9 +531,36 @@
 
         for crawl in crawls:
             if crawl.resources:
-                all_files.extend(crawl.resources)
+                resources.extend(crawl.resources)
 
-        return all_files
+        preload_resources: List[PreloadResource] = []
+
+        if include_preloads:
+            no_page_items = await self.get_collection_resources_with_no_pages(crawls)
+            for item in no_page_items:
+                preload_resources.append(item)
+
+        return resources, preload_resources
+
+    async def get_collection_resources_with_no_pages(
+        self, crawls: List[CrawlOutWithResources]
+    ) -> List[PreloadResource]:
+        """Return wacz files in collection that have no pages"""
+        resources_no_pages: List[PreloadResource] = []
+
+        for crawl in crawls:
+            _, page_count = await self.page_ops.list_pages(crawl.id)
+            if page_count == 0 and crawl.resources:
+                for resource in crawl.resources:
+                    resources_no_pages.append(
+                        PreloadResource(
+                            name=os.path.basename(resource.name),
+                            crawlId=crawl.id,
+                            hasPages=False,
+                        )
+                    )
+
+        return resources_no_pages
 
     async def get_collection_names(self, uuids: List[UUID]):
         """return object of {_id, names} given list of collection ids"""
@@ -528,9 +580,15 @@
         names = [name for name in names if name]
         return {"names": names}
 
-    async def get_collection_crawl_ids(self, coll_id: UUID) -> List[str]:
-        """Return list of crawl ids in collection"""
+    async def get_collection_crawl_ids(
+        self, coll_id: UUID, public_or_unlisted_only=False
+    ) -> List[str]:
+        """Return list of crawl ids in collection, including only public collections"""
         crawl_ids = []
+        # ensure collection is public or unlisted, else throw here
+        if public_or_unlisted_only:
+            await self.get_collection_raw(coll_id, public_or_unlisted_only)
+
         async for crawl_raw in self.crawls.find(
             {"collectionIds": coll_id}, projection=["_id"]
         ):
@@ -1010,8 +1068,8 @@ def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_de
         try:
             all_collections, _ = await colls.list_collections(org, page_size=10_000)
             for collection in all_collections:
-                results[collection.name] = await colls.get_collection_crawl_resources(
-                    collection.id
+                results[collection.name], _ = (
+                    await colls.get_collection_crawl_resources(collection.id)
                 )
         except Exception as exc:
             # pylint: disable=raise-missing-from
@@ -1047,9 +1105,11 @@ def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_de
         response_model=CollOut,
     )
     async def get_collection_replay(
-        coll_id: UUID, org: Organization = Depends(org_viewer_dep)
+        request: Request, coll_id: UUID, org: Organization = Depends(org_viewer_dep)
     ):
-        return await colls.get_collection_out(coll_id, org, resources=True)
+        return await colls.get_collection_out(
+            coll_id, org, resources=True, headers=dict(request.headers)
+        )
 
     @app.get(
         "/orgs/{oid}/collections/{coll_id}/public/replay.json",
@@ -1057,12 +1117,17 @@
         response_model=CollOut,
     )
     async def get_collection_public_replay(
+        request: Request,
         response: Response,
         coll_id: UUID,
         org: Organization = Depends(org_public),
     ):
         coll = await colls.get_collection_out(
-            coll_id, org, resources=True, public_or_unlisted_only=True
+            coll_id,
+            org,
+            resources=True,
+            public_or_unlisted_only=True,
+            headers=dict(request.headers),
         )
         response.headers["Access-Control-Allow-Origin"] = "*"
        response.headers["Access-Control-Allow-Headers"] = "*"
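Taken together, the collection-side changes mean a collection's /replay.json now bundles the pre-signed resources with an initial page list, a total page count, a pages query URL, and any preload resources. An abridged, illustrative response shape (all values are placeholders, not real output):

```python
# Illustrative only: approximate shape of a collection /replay.json response
# after this change, with placeholder values.
coll_replay = {
    "resources": [
        {"name": "crawl-1.wacz", "path": "https://storage.example.com/signed-url"},
    ],
    "preloadResources": [
        # WACZ files from crawls with zero indexed pages; RWP should always load these
        {"name": "upload-1.wacz", "crawlId": "manual-upload-1", "hasPages": False},
    ],
    "initialPages": [
        # up to 25 pages, seed pages first, then sorted by capture time
        {"url": "https://example.com/", "isSeed": True, "ts": "2025-01-01T00:00:00Z"},
    ],
    "totalPages": 100,
    "pagesQueryUrl": "https://app.example.com/api/orgs/<oid>/collections/<coll-id>/pages",
}

print(coll_replay["pagesQueryUrl"])
```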
@@ -881,14 +881,6 @@ class CrawlOut(BaseMongoModel):
     errorPageCount: Optional[int] = 0
 
 
-# ============================================================================
-class CrawlOutWithResources(CrawlOut):
-    """Crawl output model including resources"""
-
-    resources: Optional[List[CrawlFileOut]] = []
-    collections: Optional[List[CollIdName]] = []
-
-
 # ============================================================================
 class UpdateCrawl(BaseModel):
     """Update crawl"""
@@ -1222,6 +1214,173 @@ class ImageFilePreparer(FilePreparer):
         )
 
 
+# ============================================================================
+
+### PAGES ###
+
+
+# ============================================================================
+class PageReviewUpdate(BaseModel):
+    """Update model for page manual review/approval"""
+
+    approved: Optional[bool] = None
+
+
+# ============================================================================
+class PageNoteIn(BaseModel):
+    """Input model for adding page notes"""
+
+    text: str
+
+
+# ============================================================================
+class PageNoteEdit(BaseModel):
+    """Input model for editing page notes"""
+
+    id: UUID
+    text: str
+
+
+# ============================================================================
+class PageNoteDelete(BaseModel):
+    """Delete model for page notes"""
+
+    delete_list: List[UUID] = []
+
+
+# ============================================================================
+class PageNote(BaseModel):
+    """Model for page notes, tracking user and time"""
+
+    id: UUID
+    text: str
+    created: datetime
+    userid: UUID
+    userName: str
+
+
+# ============================================================================
+class PageQACompare(BaseModel):
+    """Model for updating pages from QA run"""
+
+    screenshotMatch: Optional[float] = None
+    textMatch: Optional[float] = None
+    resourceCounts: Optional[Dict[str, int]] = None
+
+
+# ============================================================================
+class Page(BaseMongoModel):
+    """Core page data, no QA"""
+
+    id: UUID
+
+    oid: UUID
+    crawl_id: str
+
+    # core page data
+    url: AnyHttpUrl
+    title: Optional[str] = None
+    ts: Optional[datetime] = None
+    loadState: Optional[int] = None
+    status: Optional[int] = None
+    mime: Optional[str] = None
+    filename: Optional[str] = None
+    depth: Optional[int] = None
+    favIconUrl: Optional[AnyHttpUrl] = None
+    isSeed: Optional[bool] = False
+
+    # manual review
+    userid: Optional[UUID] = None
+    modified: Optional[datetime] = None
+    approved: Optional[bool] = None
+    notes: List[PageNote] = []
+
+    isFile: Optional[bool] = False
+    isError: Optional[bool] = False
+
+    def compute_page_type(self):
+        """sets self.isFile or self.isError flags"""
+        self.isFile = False
+        self.isError = False
+        if self.loadState == 2:
+            # pylint: disable=unsupported-membership-test
+            if self.mime and "html" not in self.mime:
+                self.isFile = True
+            elif self.title is None and self.status == 200:
+                self.isFile = True
+
+        elif self.loadState == 0:
+            self.isError = True
+
+
+# ============================================================================
+class PageWithAllQA(Page):
+    """Model for core page data + qa"""
+
+    # automated heuristics, keyed by QA run id
+    qa: Optional[Dict[str, PageQACompare]] = {}
+
+
+# ============================================================================
+class PageOut(Page):
+    """Model for pages output, no QA"""
+
+    status: int = 200
+
+
+# ============================================================================
+class PageOutWithSingleQA(Page):
+    """Page out with single QA entry"""
+
+    qa: Optional[PageQACompare] = None
+
+
+# ============================================================================
+class PageNoteAddedResponse(BaseModel):
+    """Model for response to adding page"""
+
+    added: bool
+    data: PageNote
+
+
+# ============================================================================
+class PageNoteUpdatedResponse(BaseModel):
+    """Model for response to updating page"""
+
+    updated: bool
+    data: PageNote
+
+
+# ============================================================================
+class PageIdTimestamp(BaseModel):
+    """Simplified model for page info to include in PageUrlCount"""
+
+    pageId: UUID
+    ts: Optional[datetime] = None
+    status: int = 200
+
+
+# ============================================================================
+class PageUrlCount(BaseModel):
+    """Model for counting pages by URL"""
+
+    url: AnyHttpUrl
+    count: int = 0
+    snapshots: List[PageIdTimestamp] = []
+
+
+# ============================================================================
+class CrawlOutWithResources(CrawlOut):
+    """Crawl output model including resources"""
+
+    resources: Optional[List[CrawlFileOut]] = []
+    collections: Optional[List[CollIdName]] = []
+
+    initialPages: List[PageOut] = []
+    totalPages: Optional[int] = None
+    pagesQueryUrl: str = ""
+
+
 # ============================================================================
 
 ### COLLECTIONS ###
@@ -1245,6 +1404,15 @@ class CollectionThumbnailSource(BaseModel):
     urlPageId: UUID
 
 
+# ============================================================================
+class PreloadResource(BaseModel):
+    """Resources that will preloaded in RWP"""
+
+    name: str
+    crawlId: str
+    hasPages: bool
+
+
 # ============================================================================
 class Collection(BaseMongoModel):
     """Org collection structure"""
@@ -1338,6 +1506,11 @@ class CollOut(BaseMongoModel):
 
     allowPublicDownload: bool = True
 
+    initialPages: List[PageOut] = []
+    totalPages: Optional[int] = None
+    preloadResources: List[PreloadResource] = []
+    pagesQueryUrl: str = ""
+
 
 # ============================================================================
 class PublicCollOut(BaseMongoModel):
@@ -2435,161 +2608,6 @@ AnyJob = RootModel[
 ]
 
 
-# ============================================================================
-
-### PAGES ###
-
-
-# ============================================================================
-class PageReviewUpdate(BaseModel):
-    """Update model for page manual review/approval"""
-
-    approved: Optional[bool] = None
-
-
-# ============================================================================
-class PageNoteIn(BaseModel):
-    """Input model for adding page notes"""
-
-    text: str
-
-
-# ============================================================================
-class PageNoteEdit(BaseModel):
-    """Input model for editing page notes"""
-
-    id: UUID
-    text: str
-
-
-# ============================================================================
-class PageNoteDelete(BaseModel):
-    """Delete model for page notes"""
-
-    delete_list: List[UUID] = []
-
-
-# ============================================================================
-class PageNote(BaseModel):
-    """Model for page notes, tracking user and time"""
-
-    id: UUID
-    text: str
-    created: datetime
-    userid: UUID
-    userName: str
-
-
-# ============================================================================
-class PageQACompare(BaseModel):
-    """Model for updating pages from QA run"""
-
-    screenshotMatch: Optional[float] = None
-    textMatch: Optional[float] = None
-    resourceCounts: Optional[Dict[str, int]] = None
-
-
-# ============================================================================
-class Page(BaseMongoModel):
-    """Core page data, no QA"""
-
-    id: UUID
-
-    oid: UUID
-    crawl_id: str
-
-    # core page data
-    url: AnyHttpUrl
-    title: Optional[str] = None
-    ts: Optional[datetime] = None
-    loadState: Optional[int] = None
-    status: Optional[int] = None
-    mime: Optional[str] = None
-    filename: Optional[str] = None
-    depth: Optional[int] = None
-    favIconUrl: Optional[AnyHttpUrl] = None
-    isSeed: Optional[bool] = False
-
-    # manual review
-    userid: Optional[UUID] = None
-    modified: Optional[datetime] = None
-    approved: Optional[bool] = None
-    notes: List[PageNote] = []
-
-    isFile: Optional[bool] = False
-    isError: Optional[bool] = False
-
-    def compute_page_type(self):
-        """sets self.isFile or self.isError flags"""
-        self.isFile = False
-        self.isError = False
-        if self.loadState == 2:
-            # pylint: disable=unsupported-membership-test
-            if self.mime and "html" not in self.mime:
-                self.isFile = True
-            elif self.title is None and self.status == 200:
-                self.isFile = True
-
-        elif self.loadState == 0:
-            self.isError = True
-
-
-# ============================================================================
-class PageWithAllQA(Page):
-    """Model for core page data + qa"""
-
-    # automated heuristics, keyed by QA run id
-    qa: Optional[Dict[str, PageQACompare]] = {}
-
-
-# ============================================================================
-class PageOut(Page):
-    """Model for pages output, no QA"""
-
-    status: int = 200
-
-
-# ============================================================================
-class PageOutWithSingleQA(Page):
-    """Page out with single QA entry"""
-
-    qa: Optional[PageQACompare] = None
-
-
-# ============================================================================
-class PageNoteAddedResponse(BaseModel):
-    """Model for response to adding page"""
-
-    added: bool
-    data: PageNote
-
-
-# ============================================================================
-class PageNoteUpdatedResponse(BaseModel):
-    """Model for response to updating page"""
-
-    updated: bool
-    data: PageNote
-
-
-# ============================================================================
-class PageIdTimestamp(BaseModel):
-    """Simplified model for page info to include in PageUrlCount"""
-
-    pageId: UUID
-    ts: Optional[datetime] = None
-    status: int = 200
-
-
-# ============================================================================
-class PageUrlCount(BaseModel):
-    """Model for counting pages by URL"""
-
-    url: AnyHttpUrl
-    count: int = 0
-    snapshots: List[PageIdTimestamp] = []
-
-
 # ============================================================================
 
 ### GENERIC RESPONSE MODELS ###
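Since the page models (including `Page.compute_page_type`) moved wholesale in the hunk above, a small usage sketch may be helpful; importing the backend package as `btrixcloud` is an assumption here, inferred from the relative imports in the diff:

```python
from uuid import uuid4

from btrixcloud.models import Page  # package name assumed

# loadState == 2 with a non-HTML mime type marks the page as a file
pdf_page = Page(
    id=uuid4(),
    oid=uuid4(),
    crawl_id="crawl-1",
    url="https://example.com/report.pdf",
    loadState=2,
    mime="application/pdf",
)
pdf_page.compute_page_type()
assert pdf_page.isFile and not pdf_page.isError

# loadState == 0 marks the page as an error
failed_page = Page(
    id=uuid4(),
    oid=uuid4(),
    crawl_id="crawl-1",
    url="https://example.com/broken",
    loadState=0,
)
failed_page.compute_page_type()
assert failed_page.isError and not failed_page.isFile
```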
@@ -11,7 +11,7 @@ from datetime import datetime
 from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
 from uuid import UUID, uuid4
 
-from fastapi import Depends, HTTPException, Request
+from fastapi import Depends, HTTPException, Request, Response
 import pymongo
 
 from .models import (
@@ -35,6 +35,7 @@ from .models import (
     DeletedResponse,
     PageNoteAddedResponse,
     PageNoteUpdatedResponse,
+    EmptyResponse,
 )
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
 from .utils import str_to_date, str_list_to_bools, dt_now
@@ -503,6 +504,7 @@ class PageOps:
         self,
         crawl_id: str,
         org: Optional[Organization] = None,
+        search: Optional[str] = None,
         url: Optional[str] = None,
         url_prefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -534,6 +536,13 @@
         if org:
             query["oid"] = org.id
 
+        if search:
+            search_regex = re.escape(urllib.parse.unquote(search))
+            query["$or"] = [
+                {"url": {"$regex": search_regex, "$options": "i"}},
+                {"title": {"$regex": search_regex, "$options": "i"}},
+            ]
+
         if url_prefix:
             url_prefix = urllib.parse.unquote(url_prefix)
             regex_pattern = f"^{re.escape(url_prefix)}"
@@ -661,6 +670,7 @@
         self,
         coll_id: UUID,
         org: Optional[Organization] = None,
+        search: Optional[str] = None,
         url: Optional[str] = None,
         url_prefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -670,6 +680,7 @@
         page: int = 1,
         sort_by: Optional[str] = None,
         sort_direction: Optional[int] = -1,
+        public_or_unlisted_only=False,
     ) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]:
         """List all pages in collection, with optional filtering"""
         # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
@@ -677,7 +688,9 @@
         page = page - 1
         skip = page_size * page
 
-        crawl_ids = await self.coll_ops.get_collection_crawl_ids(coll_id)
+        crawl_ids = await self.coll_ops.get_collection_crawl_ids(
+            coll_id, public_or_unlisted_only
+        )
 
         query: dict[str, object] = {
             "crawl_id": {"$in": crawl_ids},
@@ -685,7 +698,14 @@
         if org:
             query["oid"] = org.id
 
-        if url_prefix:
+        if search:
+            search_regex = re.escape(urllib.parse.unquote(search))
+            query["$or"] = [
+                {"url": {"$regex": search_regex, "$options": "i"}},
+                {"title": {"$regex": search_regex, "$options": "i"}},
+            ]
+
+        elif url_prefix:
             url_prefix = urllib.parse.unquote(url_prefix)
             regex_pattern = f"^{re.escape(url_prefix)}"
             query["url"] = {"$regex": regex_pattern, "$options": "i"}
@@ -724,6 +744,9 @@
                 raise HTTPException(status_code=400, detail="invalid_sort_direction")
 
             aggregate.extend([{"$sort": {sort_by: sort_direction}}])
+        else:
+            # default sort: seeds first, then by timestamp
+            aggregate.extend([{"$sort": {"isSeed": -1, "ts": 1}}])
 
         aggregate.extend(
             [
@@ -886,6 +909,7 @@ def init_pages_api(
 
     org_viewer_dep = org_ops.org_viewer_dep
     org_crawl_dep = org_ops.org_crawl_dep
+    org_public = org_ops.org_public
 
     @app.post(
         "/orgs/{oid}/crawls/all/pages/reAdd",
@@ -1056,6 +1080,7 @@ def init_pages_api(
     async def get_crawl_pages_list(
         crawl_id: str,
         org: Organization = Depends(org_crawl_dep),
+        search: Optional[str] = None,
        url: Optional[str] = None,
         urlPrefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -1077,6 +1102,7 @@ def init_pages_api(
         pages, total = await ops.list_pages(
             crawl_id=crawl_id,
             org=org,
+            search=search,
             url=url,
             url_prefix=urlPrefix,
             ts=ts,
@@ -1093,13 +1119,15 @@ def init_pages_api(
         return paginated_format(pages, total, page, pageSize)
 
     @app.get(
-        "/orgs/{oid}/collections/{coll_id}/pages",
+        "/orgs/{oid}/collections/{coll_id}/public/pages",
         tags=["pages", "collections"],
         response_model=PaginatedPageOutResponse,
     )
-    async def get_collection_pages_list(
+    async def get_public_collection_pages_list(
         coll_id: UUID,
-        org: Organization = Depends(org_viewer_dep),
+        response: Response,
+        org: Organization = Depends(org_public),
+        search: Optional[str] = None,
         url: Optional[str] = None,
         urlPrefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -1114,6 +1142,58 @@ def init_pages_api(
         pages, total = await ops.list_collection_pages(
             coll_id=coll_id,
             org=org,
+            search=search,
             url=url,
             url_prefix=urlPrefix,
             ts=ts,
+            is_seed=isSeed,
+            depth=depth,
+            page_size=pageSize,
+            page=page,
+            sort_by=sortBy,
+            sort_direction=sortDirection,
+            public_or_unlisted_only=True,
+        )
+
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return paginated_format(pages, total, page, pageSize)
+
+    @app.options(
+        "/orgs/{oid}/collections/{coll_id}/public/pages",
+        tags=["pages", "collections"],
+        response_model=EmptyResponse,
+    )
+    async def get_replay_preflight(response: Response):
+        response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return {}
+
+    @app.get(
+        "/orgs/{oid}/collections/{coll_id}/pages",
+        tags=["pages", "collections"],
+        response_model=PaginatedPageOutResponse,
+    )
+    async def get_collection_pages_list(
+        coll_id: UUID,
+        org: Organization = Depends(org_viewer_dep),
+        search: Optional[str] = None,
+        url: Optional[str] = None,
+        urlPrefix: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        isSeed: Optional[bool] = None,
+        depth: Optional[int] = None,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+        sortBy: Optional[str] = None,
+        sortDirection: Optional[int] = -1,
+    ):
+        """Retrieve paginated list of pages in collection"""
+        pages, total = await ops.list_collection_pages(
+            coll_id=coll_id,
+            org=org,
+            search=search,
+            url=url,
+            url_prefix=urlPrefix,
+            ts=ts,
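To exercise the new public pages endpoint and `search` filter added above (a case-insensitive substring match against page `url` and `title`), a request might look like this; the base URL and IDs are placeholders:

```python
import requests

API_BASE = "https://app.example.com/api"
ORG_ID = "<org-uuid>"
COLL_ID = "<coll-uuid>"

# No auth header needed: the endpoint returns 404 unless the collection is
# public or unlisted, and it sets permissive CORS headers for embedding.
resp = requests.get(
    f"{API_BASE}/orgs/{ORG_ID}/collections/{COLL_ID}/public/pages",
    params={"search": "example", "page": 1, "pageSize": 25},
)
resp.raise_for_status()
data = resp.json()
print(data["total"], [p["url"] for p in data["items"]])
```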
@@ -382,6 +382,11 @@ def test_get_collection_replay(crawler_auth_headers, default_org_id):
     assert data["dateEarliest"]
     assert data["dateLatest"]
     assert data["defaultThumbnailName"]
+    assert data["initialPages"]
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/collections/{_coll_id}/pages"
+    )
+    assert "preloadResources" in data
 
     resources = data["resources"]
     assert resources
@@ -413,10 +418,27 @@ def test_collection_public(crawler_auth_headers, default_org_id):
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
         headers=crawler_auth_headers,
     )
+    data = r.json()
+    assert data["initialPages"]
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/collections/{_coll_id}/public/pages"
+    )
+    assert "preloadResources" in data
+
     assert r.status_code == 200
     assert r.headers["Access-Control-Allow-Origin"] == "*"
     assert r.headers["Access-Control-Allow-Headers"] == "*"
 
+    # test public pages endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] > 0
+    assert data["items"]
+
     # make unlisted and test replay headers
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
@@ -451,6 +473,12 @@
     )
     assert r.status_code == 404
 
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 404
+
 
 def test_collection_access_invalid_value(crawler_auth_headers, default_org_id):
     r = requests.patch(
@@ -614,6 +642,39 @@ def test_list_pages_in_collection(crawler_auth_headers, default_org_id):
     coll_page_id = coll_page["id"]
     coll_page_url = coll_page["url"]
     coll_page_ts = coll_page["ts"]
+    coll_page_title = coll_page["title"]
+
+    # Test search filter
+    partial_title = coll_page_title[:5]
+    partial_url = coll_page_url[:8]
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_title}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert (
+            partial_title in matching_page["title"]
+            or partial_url in matching_page["url"]
+        )
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_url}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert (
+            partial_title in matching_page["title"]
+            or partial_url in matching_page["url"]
+        )
+
     # Test exact url filter
     r = requests.get(
@@ -184,6 +184,11 @@ def test_wait_for_complete(admin_auth_headers, default_org_id):
     assert len(data["resources"]) == 1
     assert data["resources"][0]["path"]
 
+    assert len(data["initialPages"]) == 1
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/crawls/{admin_crawl_id}/pages"
+    )
+
     # ensure filename matches specified pattern
     # set in default_crawl_filename_template
     assert re.search("/[\\d]+-testing-[\\w-]+\\.wacz", data["resources"][0]["path"])