Add collection page list/search endpoint (#2354)

Fixes #2353

Adds a new endpoint to list pages in a collection, with filtering on
`url` (exact match), `urlPrefix`, `ts`, `isSeed`, and `depth`, along
with accompanying tests. Additional sort options have been added as
well.
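
For illustration, a request against the new endpoint could look like the
sketch below; the deployment URL, org and collection IDs, and auth token are
placeholders, not values taken from this change:

    import requests

    API_PREFIX = "https://app.example.com/api"  # placeholder deployment URL
    headers = {"Authorization": "Bearer <token>"}  # placeholder auth header

    # List pages in a collection, filtered to seed pages under a URL prefix,
    # newest first
    r = requests.get(
        f"{API_PREFIX}/orgs/<org_id>/collections/<coll_id>/pages",
        params={
            "urlPrefix": "https://webrecorder.net/",
            "isSeed": "true",
            "sortBy": "ts",
            "sortDirection": -1,
            "page": 1,
            "pageSize": 25,
        },
        headers=headers,
    )
    data = r.json()
    print(data["total"], [page["url"] for page in data["items"]])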

These same filters and sort options have also been added to the crawl
pages endpoint.
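
Continuing the sketch above with the same placeholders, the crawl pages
endpoint now accepts the same query parameters:

    # Seed pages at depth 0 for a single crawl, sorted by URL ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/<org_id>/crawls/<crawl_id>/pages",
        params={"isSeed": "true", "depth": 0, "sortBy": "url", "sortDirection": 1},
        headers=headers,
    )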

Also fixes an issue where `isSeed` was not stored in the database when
false, only added on serialization, which prevented filtering on that
field from working as expected.
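
A minimal sketch of the underlying serialization behavior, using a
hypothetical stand-in model and Pydantic v2's model_dump rather than the real
Page model and its to_dict() wrapper:

    from pydantic import BaseModel

    class Page(BaseModel):
        url: str
        isSeed: bool = False

    # The ingest code now sets isSeed explicitly, even when it is False
    page = Page(url="https://example.com/", isSeed=False)

    # Old serialization: exclude_defaults drops isSeed because False equals
    # the field default, so the stored document has no isSeed field and a
    # {"isSeed": False} filter never matches it.
    page.model_dump(exclude_unset=True, exclude_none=True, exclude_defaults=True)
    # -> {'url': 'https://example.com/'}

    # New serialization: without exclude_defaults, the explicit False is kept.
    page.model_dump(exclude_unset=True, exclude_none=True)
    # -> {'url': 'https://example.com/', 'isSeed': False}
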
Tessa Walsh 2025-02-10 19:44:37 -05:00 committed by GitHub
parent 001839a521
commit 98a45b0d85
6 changed files with 423 additions and 23 deletions

View File

@ -248,7 +248,14 @@ def main() -> None:
upload_ops = init_uploads_api(*base_crawl_init)
page_ops = init_pages_api(
- app, mdb, crawls, org_ops, storage_ops, background_job_ops, current_active_user
+ app,
+ mdb,
+ crawls,
+ org_ops,
+ storage_ops,
+ background_job_ops,
+ coll_ops,
+ current_active_user,
)
base_crawl_ops.set_page_ops(page_ops)

View File

@ -89,7 +89,9 @@ def init_ops() -> Tuple[
upload_ops = UploadOps(*base_crawl_init)
- page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops)
+ page_ops = PageOps(
+ mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops
+ )
base_crawl_ops.set_page_ops(page_ops)
crawl_ops.set_page_ops(page_ops)

View File

@ -1,8 +1,12 @@
"""crawl pages"""
# pylint: disable=too-many-lines
import asyncio
import os
import re
import traceback
import urllib.parse
from datetime import datetime
from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
from uuid import UUID, uuid4
@ -37,11 +41,12 @@ from .utils import str_to_date, str_list_to_bools, dt_now
if TYPE_CHECKING:
from .background_jobs import BackgroundJobOps
+ from .colls import CollectionOps
from .crawls import CrawlOps
from .orgs import OrgOps
from .storages import StorageOps
else:
- CrawlOps = StorageOps = OrgOps = BackgroundJobOps = object
+ CrawlOps = StorageOps = OrgOps = BackgroundJobOps = CollectionOps = object
# ============================================================================
@ -53,14 +58,18 @@ class PageOps:
org_ops: OrgOps
storage_ops: StorageOps
background_job_ops: BackgroundJobOps
+ coll_ops: CollectionOps
- def __init__(self, mdb, crawl_ops, org_ops, storage_ops, background_job_ops):
+ def __init__(
+ self, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops
+ ):
self.pages = mdb["pages"]
self.crawls = mdb["crawls"]
self.crawl_ops = crawl_ops
self.org_ops = org_ops
self.storage_ops = storage_ops
self.background_job_ops = background_job_ops
+ self.coll_ops = coll_ops
async def init_index(self):
"""init index for pages db collection"""
@ -82,6 +91,9 @@ class PageOps:
if not page_dict.get("url"):
continue
if not page_dict.get("isSeed"):
page_dict["isSeed"] = False
if len(pages_buffer) > batch_size:
await self._add_pages_to_db(crawl_id, pages_buffer)
pages_buffer = []
@ -210,9 +222,8 @@ class PageOps:
):
"""Add page to database"""
page = self._get_page_from_dict(page_dict, crawl_id, oid)
- page_to_insert = page.to_dict(
- exclude_unset=True, exclude_none=True, exclude_defaults=True
- )
+ page_to_insert = page.to_dict(exclude_unset=True, exclude_none=True)
try:
await self.pages.insert_one(page_to_insert)
@ -492,6 +503,11 @@ class PageOps:
self,
crawl_id: str,
org: Optional[Organization] = None,
+ url: Optional[str] = None,
+ url_prefix: Optional[str] = None,
+ ts: Optional[datetime] = None,
+ is_seed: Optional[bool] = None,
+ depth: Optional[int] = None,
qa_run_id: Optional[str] = None,
qa_filter_by: Optional[str] = None,
qa_gte: Optional[float] = None,
@ -518,6 +534,23 @@ class PageOps:
if org:
query["oid"] = org.id
+ if url_prefix:
+ url_prefix = urllib.parse.unquote(url_prefix)
+ regex_pattern = f"^{re.escape(url_prefix)}"
+ query["url"] = {"$regex": regex_pattern, "$options": "i"}
+ elif url:
+ query["url"] = urllib.parse.unquote(url)
+ if ts:
+ query["ts"] = ts
+ if is_seed in (True, False):
+ query["isSeed"] = is_seed
+ if isinstance(depth, int):
+ query["depth"] = depth
if reviewed:
query["$or"] = [
{"approved": {"$ne": None}},
@ -562,7 +595,18 @@ class PageOps:
# Sorting options to add:
# - automated heuristics like screenshot_comparison (dict keyed by QA run id)
# - Ensure notes sorting works okay with notes in list
sort_fields = ("url", "title", "notes", "approved")
sort_fields = (
"url",
"title",
"notes",
"approved",
"ts",
"status",
"mime",
"filename",
"depth",
"isSeed",
)
qa_sort_fields = ("screenshotMatch", "textMatch")
if sort_by not in sort_fields and sort_by not in qa_sort_fields:
raise HTTPException(status_code=400, detail="invalid_sort_by")
@ -613,6 +657,101 @@ class PageOps:
return [PageOut.from_dict(data) for data in items], total
async def list_collection_pages(
self,
coll_id: UUID,
org: Optional[Organization] = None,
url: Optional[str] = None,
url_prefix: Optional[str] = None,
ts: Optional[datetime] = None,
is_seed: Optional[bool] = None,
depth: Optional[int] = None,
page_size: int = DEFAULT_PAGE_SIZE,
page: int = 1,
sort_by: Optional[str] = None,
sort_direction: Optional[int] = -1,
) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]:
"""List all pages in collection, with optional filtering"""
# pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
# Zero-index page for query
page = page - 1
skip = page_size * page
crawl_ids = await self.coll_ops.get_collection_crawl_ids(coll_id)
query: dict[str, object] = {
"crawl_id": {"$in": crawl_ids},
}
if org:
query["oid"] = org.id
if url_prefix:
url_prefix = urllib.parse.unquote(url_prefix)
regex_pattern = f"^{re.escape(url_prefix)}"
query["url"] = {"$regex": regex_pattern, "$options": "i"}
elif url:
query["url"] = urllib.parse.unquote(url)
if ts:
query["ts"] = ts
if is_seed in (True, False):
query["isSeed"] = is_seed
if isinstance(depth, int):
query["depth"] = depth
aggregate = [{"$match": query}]
if sort_by:
# Sorting options to add:
# - automated heuristics like screenshot_comparison (dict keyed by QA run id)
# - Ensure notes sorting works okay with notes in list
sort_fields = (
"url",
"crawl_id",
"ts",
"status",
"mime",
"filename",
"depth",
"isSeed",
)
if sort_by not in sort_fields:
raise HTTPException(status_code=400, detail="invalid_sort_by")
if sort_direction not in (1, -1):
raise HTTPException(status_code=400, detail="invalid_sort_direction")
aggregate.extend([{"$sort": {sort_by: sort_direction}}])
aggregate.extend(
[
{
"$facet": {
"items": [
{"$skip": skip},
{"$limit": page_size},
],
"total": [{"$count": "count"}],
}
},
]
)
# Get total
cursor = self.pages.aggregate(aggregate)
results = await cursor.to_list(length=1)
result = results[0]
items = result["items"]
try:
total = int(result["total"][0]["count"])
except (IndexError, ValueError):
total = 0
return [PageOut.from_dict(data) for data in items], total
async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
"""Delete existing pages for crawl and re-add from WACZs."""
await self.delete_crawl_pages(crawl_id, oid)
@ -738,13 +877,14 @@ class PageOps:
# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
def init_pages_api(
- app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, user_dep
+ app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep
):
"""init pages API"""
# pylint: disable=invalid-name
- ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops)
+ ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops)
org_viewer_dep = org_ops.org_viewer_dep
org_crawl_dep = org_ops.org_crawl_dep
@app.post(
@ -913,9 +1053,14 @@ def init_pages_api(
tags=["pages", "all-crawls"],
response_model=PaginatedPageOutResponse,
)
- async def get_pages_list(
+ async def get_crawl_pages_list(
crawl_id: str,
org: Organization = Depends(org_crawl_dep),
+ url: Optional[str] = None,
+ urlPrefix: Optional[str] = None,
+ ts: Optional[datetime] = None,
+ isSeed: Optional[bool] = None,
+ depth: Optional[int] = None,
reviewed: Optional[bool] = None,
approved: Optional[str] = None,
hasNotes: Optional[bool] = None,
@ -932,6 +1077,11 @@ def init_pages_api(
pages, total = await ops.list_pages(
crawl_id=crawl_id,
org=org,
+ url=url,
+ url_prefix=urlPrefix,
+ ts=ts,
+ is_seed=isSeed,
+ depth=depth,
reviewed=reviewed,
approved=formatted_approved,
has_notes=hasNotes,
@ -942,6 +1092,40 @@ def init_pages_api(
)
return paginated_format(pages, total, page, pageSize)
@app.get(
"/orgs/{oid}/collections/{coll_id}/pages",
tags=["pages", "collections"],
response_model=PaginatedPageOutResponse,
)
async def get_collection_pages_list(
coll_id: UUID,
org: Organization = Depends(org_viewer_dep),
url: Optional[str] = None,
urlPrefix: Optional[str] = None,
ts: Optional[datetime] = None,
isSeed: Optional[bool] = None,
depth: Optional[int] = None,
pageSize: int = DEFAULT_PAGE_SIZE,
page: int = 1,
sortBy: Optional[str] = None,
sortDirection: Optional[int] = -1,
):
"""Retrieve paginated list of pages in collection"""
pages, total = await ops.list_collection_pages(
coll_id=coll_id,
org=org,
url=url,
url_prefix=urlPrefix,
ts=ts,
is_seed=isSeed,
depth=depth,
page_size=pageSize,
page=page,
sort_by=sortBy,
sort_direction=sortDirection,
)
return paginated_format(pages, total, page, pageSize)
@app.get(
"/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/pages",
tags=["pages", "qa"],

View File

@ -232,7 +232,7 @@ def crawler_crawl_id(crawler_auth_headers, default_org_id):
"name": "Crawler User Test Crawl",
"description": "crawler test crawl",
"tags": ["wr-test-2"],
"config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 1},
"config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 3},
"crawlerChannel": "test",
}
r = requests.post(

View File

@ -582,6 +582,121 @@ def test_list_collections(
assert second_coll["dateLatest"]
def test_list_pages_in_collection(crawler_auth_headers, default_org_id):
# Test list endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 0
pages = data["items"]
assert pages
for page in pages:
assert page["id"]
assert page["oid"]
assert page["crawl_id"]
assert page["url"]
assert page["ts"]
assert page.get("title") or page.get("title") is None
assert page.get("loadState") or page.get("loadState") is None
assert page.get("status") or page.get("status") is None
assert page.get("mime") or page.get("mime") is None
assert page["isError"] in (None, True, False)
assert page["isFile"] in (None, True, False)
# Save info for page to test url and urlPrefix filters
coll_page = pages[0]
coll_page_id = coll_page["id"]
coll_page_url = coll_page["url"]
coll_page_ts = coll_page["ts"]
# Test exact url filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?url={coll_page_url}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
for matching_page in data["items"]:
assert matching_page["url"] == coll_page_url
# Test exact url and ts filters together
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?url={coll_page_url}&ts={coll_page_ts}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
for matching_page in data["items"]:
assert matching_page["url"] == coll_page_url
assert matching_page["ts"] == coll_page_ts
# Test urlPrefix filter
url_prefix = coll_page_url[:8]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?urlPrefix={url_prefix}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
found_matching_page = False
for page in data["items"]:
if page["id"] == coll_page_id and page["url"] == coll_page_url:
found_matching_page = True
assert found_matching_page
# Test isSeed filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?isSeed=true",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
for page in data["items"]:
assert page["isSeed"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?isSeed=false",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
for page in data["items"]:
assert page["isSeed"] is False
# Test depth filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?depth=0",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
for page in data["items"]:
assert page["depth"] == 0
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?depth=1",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
for page in data["items"]:
assert page["depth"] == 1
def test_remove_upload_from_collection(crawler_auth_headers, default_org_id):
# Remove upload
r = requests.post(

View File

@ -658,7 +658,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 0
assert data["total"] == 3
pages = data["items"]
assert pages
@ -682,7 +683,11 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
# Test GET page endpoint
global page_id
page_id = pages[0]["id"]
test_page = pages[0]
page_id = test_page["id"]
test_page_url = test_page["url"]
test_page_ts = test_page["ts"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
headers=crawler_auth_headers,
@ -710,13 +715,100 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
assert page.get("modified") is None
assert page.get("approved") is None
# Test exact url filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?url={test_page_url}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
for matching_page in data["items"]:
assert matching_page["url"] == test_page_url
# Test exact url and ts filters together
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?url={test_page_url}&ts={test_page_ts}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
for matching_page in data["items"]:
assert matching_page["url"] == test_page_url
assert matching_page["ts"] == test_page_ts
# Test urlPrefix filter
url_prefix = test_page_url[:8]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?urlPrefix={url_prefix}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
found_matching_page = False
for page in data["items"]:
if page["id"] == page_id and page["url"] == test_page_url:
found_matching_page = True
assert found_matching_page
# Test isSeed filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?isSeed=True",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 1
for page in data["items"]:
assert page["isSeed"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?isSeed=False",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 2
for page in data["items"]:
assert page["isSeed"] is False
# Test depth filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?depth=0",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 1
for page in data["items"]:
assert page["depth"] == 0
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?depth=1",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 2
for page in data["items"]:
assert page["depth"] == 1
def test_crawl_pages_qa_filters(crawler_auth_headers, default_org_id, crawler_crawl_id):
# Test reviewed filter (page has no notes or approved so should show up in false)
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=False",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 1
assert r.json()["total"] == 3
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True",
@ -770,15 +862,15 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 0
assert r.json()["total"] == 2
# Test reviewed filter (page now approved so should show up in True)
# Test reviewed filter (page now approved so should show up in True, other pages show here)
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=False",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 0
assert r.json()["total"] == 2
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True",
@ -853,7 +945,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 0
assert r.json()["total"] == 2
def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
@ -985,14 +1077,14 @@ def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 1
assert r.json()["total"] == 3
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=true,false,none",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 1
assert r.json()["total"] == 3
# Test reviewed filter (page now has notes so should show up in True)
r = requests.get(
@ -1000,7 +1092,7 @@ def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 0
assert r.json()["total"] == 2
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True",
@ -1015,7 +1107,7 @@ def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 0
assert r.json()["total"] == 2
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?hasNotes=True",