Backend: standardize search values, filters, and sorting for archived items (#1039)
- The all-crawls list endpoint filters now conform to "Standardize list controls for archived items" (#1025) and URL-decode values before passing them in (see the usage sketch after this list).
- The uploads list endpoint now includes all all-crawls filters relevant to uploads.
- An all-crawls/search-values endpoint is added to support searching across all archived item types.
- Crawl configuration names are now copied to the crawl when the crawl is created, and crawl names and descriptions are now editable via the backend API (note: frontend changes will also be needed to make them editable via the UI).
- A migration is added to copy existing config names for active configs into their associated crawls; the migration has been tested in a local deployment.
- The new statuses generate-wacz, uploading-wacz, and pending-wait are added to tests where relevant to ensure that they pass.
- Test coverage is added for all new all-crawls endpoints, filters, and sort values.
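For reviewers, a minimal sketch of how a client might exercise the new filters, the search-values endpoint, and the now-editable crawl name/description. `API_PREFIX`, `org_id`, `crawl_id`, and `auth_headers` are placeholders assumed for illustration, not part of this change:

    import urllib.parse
    import requests

    # Placeholders (assumptions): point these at a running deployment.
    API_PREFIX = "https://btrix.example.org/api"
    org_id = "<org-uuid>"
    crawl_id = "<crawl-id>"
    auth_headers = {"Authorization": "Bearer <token>"}

    # Filter values may be URL-encoded by the client; the backend decodes them.
    first_seed = urllib.parse.quote("https://webrecorder.net/", safe="")
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/all-crawls"
        f"?firstSeed={first_seed}&crawlType=crawl&sortBy=fileSize&sortDirection=-1",
        headers=auth_headers,
    )
    print(r.json()["total"])

    # New endpoint: unique names, descriptions, first seeds, and crawl IDs
    # to back searching across all archived item types.
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/all-crawls/search-values",
        headers=auth_headers,
    )
    print(r.json()["names"])

    # Crawl name and description are now editable via the backend API.
    r = requests.patch(
        f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}",
        headers=auth_headers,
        json={"name": "Updated crawl name", "description": "Lorem ipsum"},
    )
    assert r.status_code == 200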
parent 9236a07800 · commit 7ff57ce6b5
@@ -5,6 +5,7 @@ import uuid
import os
from datetime import timedelta
from typing import Optional, List, Union
import urllib.parse

from pydantic import UUID4
from fastapi import HTTPException, Depends
@@ -196,16 +197,11 @@ class BaseCrawlOps:
        config = await self.crawl_configs.get_crawl_config(
            crawl.cid, org, active_only=False
        )

        if config:
            if not crawl.name:
                crawl.name = config.name

            if config.config.seeds:
                if add_first_seed:
                    first_seed = config.config.seeds[0]
                    crawl.firstSeed = first_seed.url
                crawl.seedCount = len(config.config.seeds)
        if config and config.config.seeds:
            if add_first_seed:
                first_seed = config.config.seeds[0]
                crawl.firstSeed = first_seed.url
            crawl.seedCount = len(config.config.seeds)

        if hasattr(crawl, "profileid") and crawl.profileid:
            crawl.profileName = await self.crawl_configs.profiles.get_profile_name(
@@ -327,7 +323,7 @@ class BaseCrawlOps:
            {"$pull": {"collections": collection_id}},
        )

    # pylint: disable=too-many-branches
    # pylint: disable=too-many-branches, invalid-name
    async def list_all_base_crawls(
        self,
        org: Optional[Organization] = None,
@@ -336,12 +332,14 @@ class BaseCrawlOps:
        description: str = None,
        collection_id: str = None,
        states: Optional[List[str]] = None,
        first_seed: Optional[str] = None,
        type_: Optional[str] = None,
        cid: Optional[UUID4] = None,
        cls_type: Union[CrawlOut, CrawlOutWithResources] = CrawlOut,
        page_size: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        sort_by: str = None,
        sort_direction: int = -1,
        type_=None,
    ):
        """List crawls of all types from the db"""
        # Zero-index page for query
@@ -367,7 +365,15 @@ class BaseCrawlOps:
            # validated_states = [value for value in state if value in ALL_CRAWL_STATES]
            query["state"] = {"$in": states}

        aggregate = [{"$match": query}, {"$unset": "errors"}]
        if cid:
            query["cid"] = cid

        aggregate = [
            {"$match": query},
            {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
            {"$set": {"firstSeed": "$firstSeedObject.url"}},
            {"$unset": ["firstSeedObject", "errors"]},
        ]

        if not resources:
            aggregate.extend([{"$unset": ["files"]}])
@@ -375,6 +381,9 @@ class BaseCrawlOps:
        if name:
            aggregate.extend([{"$match": {"name": name}}])

        if first_seed:
            aggregate.extend([{"$match": {"firstSeed": first_seed}}])

        if description:
            aggregate.extend([{"$match": {"description": description}}])

@@ -382,7 +391,7 @@ class BaseCrawlOps:
            aggregate.extend([{"$match": {"collections": {"$in": [collection_id]}}}])

        if sort_by:
            if sort_by not in ("started", "finished"):
            if sort_by not in ("started", "finished", "fileSize"):
                raise HTTPException(status_code=400, detail="invalid_sort_by")
            if sort_direction not in (1, -1):
                raise HTTPException(status_code=400, detail="invalid_sort_direction")
@@ -447,13 +456,40 @@ class BaseCrawlOps:

        return {"deleted": True}

    async def get_all_crawl_search_values(self, org: Organization):
        """List unique names, first seeds, and descriptions from all captures in org"""
        names = await self.crawls.distinct("name", {"oid": org.id})
        descriptions = await self.crawls.distinct("description", {"oid": org.id})
        crawl_ids = await self.crawls.distinct("_id", {"oid": org.id})
        cids = await self.crawls.distinct("cid", {"oid": org.id})

        # Remove empty strings
        names = [name for name in names if name]
        descriptions = [description for description in descriptions if description]

        # Get first seeds
        first_seeds = set()
        for cid in cids:
            if not cid:
                continue
            config = await self.crawl_configs.get_crawl_config(cid, org)
            first_seed = config.config.seeds[0]
            first_seeds.add(first_seed.url)

        return {
            "names": names,
            "descriptions": descriptions,
            "firstSeeds": list(first_seeds),
            "crawlIds": list(crawl_ids),
        }


# ============================================================================
def init_base_crawls_api(
    app, mdb, users, crawl_manager, crawl_config_ops, orgs, user_dep
):
    """base crawls api"""
    # pylint: disable=invalid-name, duplicate-code, too-many-arguments
    # pylint: disable=invalid-name, duplicate-code, too-many-arguments, too-many-locals

    ops = BaseCrawlOps(mdb, users, crawl_config_ops, crawl_manager)
@@ -472,12 +508,28 @@ def init_base_crawls_api(
        userid: Optional[UUID4] = None,
        name: Optional[str] = None,
        state: Optional[str] = None,
        firstSeed: Optional[str] = None,
        description: Optional[str] = None,
        collectionId: Optional[UUID4] = None,
        crawlType: Optional[str] = None,
        cid: Optional[UUID4] = None,
        sortBy: Optional[str] = "finished",
        sortDirection: Optional[int] = -1,
    ):
        states = state.split(",") if state else None

        if firstSeed:
            firstSeed = urllib.parse.unquote(firstSeed)

        if name:
            name = urllib.parse.unquote(name)

        if description:
            description = urllib.parse.unquote(description)

        if crawlType and crawlType not in ("crawl", "upload"):
            raise HTTPException(status_code=400, detail="invalid_crawl_type")

        crawls, total = await ops.list_all_base_crawls(
            org,
            userid=userid,
@@ -485,6 +537,9 @@ def init_base_crawls_api(
            description=description,
            collection_id=collectionId,
            states=states,
            first_seed=firstSeed,
            type_=crawlType,
            cid=cid,
            page_size=pageSize,
            page=page,
            sort_by=sortBy,
@@ -492,6 +547,12 @@ def init_base_crawls_api(
        )
        return paginated_format(crawls, total, page, pageSize)

    @app.get("/orgs/{oid}/all-crawls/search-values", tags=["all-crawls"])
    async def get_all_crawls_search_values(
        org: Organization = Depends(org_viewer_dep),
    ):
        return await ops.get_all_crawl_search_values(org)

    @app.get(
        "/orgs/{oid}/all-crawls/{crawl_id}",
        tags=["all-crawls"],
@@ -68,11 +68,15 @@ class CrawlOps(BaseCrawlOps):
        await self.crawls.create_index(
            [("type", pymongo.HASHED), ("state", pymongo.DESCENDING)]
        )
        await self.crawls.create_index(
            [("type", pymongo.HASHED), ("fileSize", pymongo.DESCENDING)]
        )

        await self.crawls.create_index([("finished", pymongo.DESCENDING)])
        await self.crawls.create_index([("oid", pymongo.HASHED)])
        await self.crawls.create_index([("cid", pymongo.HASHED)])
        await self.crawls.create_index([("state", pymongo.HASHED)])
        await self.crawls.create_index([("fileSize", pymongo.DESCENDING)])

    async def list_crawls(
        self,
@@ -127,15 +131,6 @@ class CrawlOps(BaseCrawlOps):
            {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
            {"$set": {"firstSeed": "$firstSeedObject.url"}},
            {"$unset": ["firstSeedObject", "errors"]},
            {
                "$lookup": {
                    "from": "crawl_configs",
                    "localField": "cid",
                    "foreignField": "_id",
                    "as": "crawlConfig",
                },
            },
            {"$set": {"name": {"$arrayElemAt": ["$crawlConfig.name", 0]}}},
        ]

        if not resources:
@@ -154,7 +149,12 @@ class CrawlOps(BaseCrawlOps):
            aggregate.extend([{"$match": {"collections": {"$in": [collection_id]}}}])

        if sort_by:
            if sort_by not in ("started", "finished", "fileSize", "firstSeed"):
            if sort_by not in (
                "started",
                "finished",
                "fileSize",
                "firstSeed",
            ):
                raise HTTPException(status_code=400, detail="invalid_sort_by")
            if sort_direction not in (1, -1):
                raise HTTPException(status_code=400, detail="invalid_sort_direction")
@@ -545,6 +545,7 @@ async def add_new_crawl(
        manual=manual,
        started=started,
        tags=crawlconfig.tags,
        name=crawlconfig.name,
    )

    try:
@@ -15,7 +15,7 @@ from pymongo.errors import InvalidName
from .migrations import BaseMigration


CURR_DB_VERSION = "0012"
CURR_DB_VERSION = "0013"


# ============================================================================
backend/btrixcloud/migrations/migration_0013_crawl_name.py (new file, 42 lines)
@@ -0,0 +1,42 @@
"""
Migration 0013 - Copy config name to crawls
"""
from btrixcloud.migrations import BaseMigration


MIGRATION_VERSION = "0013"


class Migration(BaseMigration):
    """Migration class."""

    def __init__(self, mdb, migration_version=MIGRATION_VERSION):
        super().__init__(mdb, migration_version)

    async def migrate_up(self):
        """Perform migration up.

        Copy crawl config names to associated crawls.
        """
        # pylint: disable=duplicate-code
        crawls = self.mdb["crawls"]
        crawl_configs = self.mdb["crawl_configs"]

        configs = [res async for res in crawl_configs.find({"inactive": {"$ne": True}})]
        if not configs:
            return

        for config in configs:
            config_id = config["_id"]
            try:
                if not config.get("name"):
                    continue
                await crawls.update_many(
                    {"cid": config_id}, {"$set": {"name": config.get("name")}}
                )
            # pylint: disable=broad-exception-caught
            except Exception as err:
                print(
                    f"Unable to set name for crawls from with config {config_id}: {err}",
                    flush=True,
                )
@@ -296,6 +296,8 @@ class BaseCrawl(BaseMongoModel):
    started: datetime
    finished: Optional[datetime]

    name: Optional[str]

    state: str

    stats: Optional[Dict[str, int]]
@@ -368,7 +370,9 @@ class CrawlOutWithResources(CrawlOut):
class UpdateCrawl(BaseModel):
    """Update crawl"""

    tags: Optional[List[str]] = []
    name: Optional[str]
    description: Optional[str]
    tags: Optional[List[str]]
    description: Optional[str]
@@ -433,7 +437,6 @@ class UploadedCrawl(BaseCrawl):

    type: str = Field("upload", const=True)

    name: str
    tags: Optional[List[str]] = []
@@ -441,8 +444,6 @@
class UpdateUpload(UpdateCrawl):
    """Update modal that also includes name"""

    name: Optional[str]


# ============================================================================
@@ -299,6 +299,7 @@ def init_uploads_api(app, mdb, users, crawl_manager, crawl_configs, orgs, user_d
        org: Organization = Depends(org_viewer_dep),
        pageSize: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        state: Optional[str] = None,
        userid: Optional[UUID4] = None,
        name: Optional[str] = None,
        description: Optional[str] = None,
@@ -306,9 +307,18 @@ def init_uploads_api(app, mdb, users, crawl_manager, crawl_configs, orgs, user_d
        sortBy: Optional[str] = "finished",
        sortDirection: Optional[int] = -1,
    ):
        states = state.split(",") if state else None

        if name:
            name = unquote(name)

        if description:
            description = unquote(description)

        uploads, total = await ops.list_all_base_crawls(
            org,
            userid=userid,
            states=states,
            name=name,
            description=description,
            page_size=pageSize,
@@ -18,6 +18,7 @@ CRAWLER_PW = "crawlerPASSWORD!"
_admin_config_id = None
_crawler_config_id = None
_auto_add_config_id = None
_all_crawls_config_id = None

NON_DEFAULT_ORG_NAME = "Non-default org"
@@ -118,6 +119,12 @@ def admin_config_id(admin_crawl_id):
    return _admin_config_id


@pytest.fixture(scope="session")
def admin_userid(admin_auth_headers):
    r = requests.get(f"{API_PREFIX}/users/me", headers=admin_auth_headers)
    return r.json()["id"]


@pytest.fixture(scope="session")
def viewer_auth_headers(admin_auth_headers, default_org_id):
    requests.post(
@@ -331,6 +338,54 @@ def auto_add_config_id(auto_add_crawl_id):
    return _auto_add_config_id


@pytest.fixture(scope="session")
def all_crawls_crawl_id(crawler_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "All Crawls Test Crawl",
        "description": "Lorem ipsum",
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    global _all_crawls_config_id
    _all_crawls_config_id = data["id"]

    crawl_id = data["run_now_job"]
    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=crawler_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            break
        time.sleep(5)

    # Add description to crawl
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}",
        headers=crawler_auth_headers,
        json={"description": "Lorem ipsum"},
    )
    assert r.status_code == 200
    return crawl_id


@pytest.fixture(scope="session")
def all_crawls_config_id(all_crawls_crawl_id):
    return _all_crawls_config_id


@pytest.fixture(scope="session")
def uploads_collection_id(crawler_auth_headers, default_org_id):
    r = requests.post(
@@ -191,10 +191,11 @@ def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id):
    # Submit patch request to update tags and description
    UPDATED_TAGS = ["wr-test-1-updated", "wr-test-2-updated"]
    UPDATED_DESC = "Lorem ipsum test note."
    UPDATED_NAME = "Updated crawl name"
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
        json={"tags": UPDATED_TAGS, "description": UPDATED_DESC},
        json={"tags": UPDATED_TAGS, "description": UPDATED_DESC, "name": UPDATED_NAME},
    )
    assert r.status_code == 200
    data = r.json()
@@ -209,6 +210,7 @@ def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id):
    data = r.json()
    assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
    assert data["description"] == UPDATED_DESC
    assert data["name"] == UPDATED_NAME

    # Verify deleting works as well
    r = requests.patch(
@@ -48,7 +48,14 @@ def test_cancel_crawl(default_org_id, crawler_auth_headers):

    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

    while data["state"] in ("running", "waiting_capacity"):
    while data["state"] in (
        "starting",
        "running",
        "waiting_capacity",
        "generate-wacz",
        "uploading-wacz",
        "pending-wait",
    ):
        time.sleep(5)
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
@@ -88,7 +95,14 @@ def test_start_crawl_and_stop_immediately(
    )
    assert r.json()["lastCrawlStopping"] == True

    while data["state"] in ("starting", "running", "waiting_capacity"):
    while data["state"] in (
        "starting",
        "running",
        "waiting_capacity",
        "generate-wacz",
        "uploading-wacz",
        "pending-wait",
    ):
        time.sleep(5)
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
@@ -149,7 +163,12 @@ def test_stop_crawl_partial(
    )
    assert r.json()["lastCrawlStopping"] == True

    while data["state"] == "running":
    while data["state"] in (
        "running",
        "generate-wacz",
        "uploading-wacz",
        "pending-wait",
    ):
        time.sleep(5)
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
@@ -9,6 +9,8 @@ upload_id = None
upload_id_2 = None
upload_dl_path = None

_coll_id = None


curr_dir = os.path.dirname(os.path.realpath(__file__))
@@ -371,6 +373,275 @@ def test_list_all_crawls(admin_auth_headers, default_org_id):
        assert item["state"]


def test_get_all_crawls_by_name(admin_auth_headers, default_org_id):
    """Test filtering /all-crawls by name"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?name=test2.wacz",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 1
    items = data["items"]
    assert items[0]["id"] == upload_id_2
    assert items[0]["name"] == "test2.wacz"

    crawl_name = "Crawler User Test Crawl"
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?name={crawl_name}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    for item in data["items"]:
        assert item["name"] == crawl_name


def test_get_all_crawls_by_first_seed(
    admin_auth_headers, default_org_id, crawler_crawl_id
):
    """Test filtering /all-crawls by first seed"""
    first_seed = "https://webrecorder.net/"
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?firstSeed={first_seed}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    for item in data["items"]:
        assert item["firstSeed"] == first_seed


def test_get_all_crawls_by_type(admin_auth_headers, default_org_id, admin_crawl_id):
    """Test filtering /all-crawls by crawl type"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=crawl",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    for item in data["items"]:
        assert item["type"] == "crawl"

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=upload",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    for item in data["items"]:
        assert item["type"] == "upload"

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=invalid",
        headers=admin_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_crawl_type"


def test_get_all_crawls_by_user(admin_auth_headers, default_org_id, crawler_userid):
    """Test filtering /all-crawls by userid"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?userid={crawler_userid}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 4
    for item in data["items"]:
        assert item["userid"] == crawler_userid


def test_get_all_crawls_by_cid(
    admin_auth_headers, default_org_id, all_crawls_config_id
):
    """Test filtering /all-crawls by cid"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?cid={all_crawls_config_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 1
    assert data["items"][0]["cid"] == all_crawls_config_id


def test_get_all_crawls_by_state(admin_auth_headers, default_org_id, admin_crawl_id):
    """Test filtering /all-crawls by cid"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?state=complete,partial_complete",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 5
    items = data["items"]
    for item in items:
        assert item["state"] in ("complete", "partial_complete")


def test_get_all_crawls_by_collection_id(
    admin_auth_headers, default_org_id, admin_config_id, all_crawls_crawl_id
):
    """Test filtering /all-crawls by collection id"""
    # Create collection and add upload to it
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=admin_auth_headers,
        json={
            "crawlIds": [all_crawls_crawl_id],
            "name": "all-crawls collection",
        },
    )
    assert r.status_code == 200
    global _coll_id
    _coll_id = r.json()["id"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?collectionId={_coll_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["total"] == 1
    assert r.json()["items"][0]["id"] == all_crawls_crawl_id


def test_sort_all_crawls(admin_auth_headers, default_org_id, admin_crawl_id):
    # Sort by started, descending (default)
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started",
        headers=admin_auth_headers,
    )
    data = r.json()
    assert data["total"] == 7
    items = data["items"]
    assert len(items) == 7

    last_created = None
    for crawl in items:
        if last_created:
            assert crawl["started"] <= last_created
        last_created = crawl["started"]

    # Sort by started, ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started&sortDirection=1",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_created = None
    for crawl in items:
        if last_created:
            assert crawl["started"] >= last_created
        last_created = crawl["started"]

    # Sort by finished
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=finished",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_finished = None
    for crawl in items:
        if not crawl["finished"]:
            continue
        if last_finished:
            assert crawl["finished"] <= last_finished
        last_finished = crawl["finished"]

    # Sort by finished, ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=finished&sortDirection=1",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_finished = None
    for crawl in items:
        if not crawl["finished"]:
            continue
        if last_finished:
            assert crawl["finished"] >= last_finished
        last_finished = crawl["finished"]

    # Sort by fileSize
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=fileSize",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_size = None
    for crawl in items:
        if last_size:
            assert crawl["fileSize"] <= last_size
        last_size = crawl["fileSize"]

    # Sort by fileSize, ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=fileSize&sortDirection=1",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_size = None
    for crawl in items:
        if last_size:
            assert crawl["fileSize"] >= last_size
        last_size = crawl["fileSize"]

    # Invalid sort value
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=invalid",
        headers=admin_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_sort_by"

    # Invalid sort_direction value
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started&sortDirection=0",
        headers=admin_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_sort_direction"


def test_all_crawls_search_values(admin_auth_headers, default_org_id):
    """Test that all-crawls search values return expected results"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/search-values",
        headers=admin_auth_headers,
    )
    data = r.json()

    assert len(data["names"]) == 5
    expected_names = [
        "Crawler User Test Crawl",
        "My Upload Updated",
        "test2.wacz",
        "All Crawls Test Crawl",
    ]
    for expected_name in expected_names:
        assert expected_name in data["names"]

    assert sorted(data["descriptions"]) == ["Lorem ipsum"]
    assert sorted(data["firstSeeds"]) == ["https://webrecorder.net/"]
    assert len(data["crawlIds"]) == 7


def test_get_upload_from_all_crawls(admin_auth_headers, default_org_id):
    """Test that /all-crawls lists crawls and uploads before deleting uploads"""
    r = requests.get(
@@ -31,8 +31,11 @@ def test_run_two_only_one_concurrent(org_with_quotas, admin_auth_headers):
    ):
        time.sleep(2)

    assert (
        get_crawl_status(org_with_quotas, crawl_id_a, admin_auth_headers) == "running"
    assert get_crawl_status(org_with_quotas, crawl_id_a, admin_auth_headers) in (
        "running",
        "generate-wacz",
        "uploading-wacz",
        "pending-wait",
    )

    while (
@@ -68,6 +71,10 @@ def test_cancel_and_run_other(org_with_quotas, admin_auth_headers):
    assert get_crawl_status(org_with_quotas, crawl_id_b, admin_auth_headers) in (
        "starting",
        "running",
        "waiting_capacity",
        "generate-wacz",
        "uploading-wacz",
        "pending-wait",
    )

    # cancel second crawl as well