From 7ff57ce6b5207f3d73afcd41ad24f8983a85f4ae Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Fri, 4 Aug 2023 12:56:52 -0400
Subject: [PATCH] Backend: standardize search values, filters, and sorting for archived items (#1039)

- all-crawls list endpoint filters now conform to 'Standardize list controls for archived items #1025' and URL-decode values before passing them in
- Uploads list endpoint now includes all all-crawls filters relevant to uploads
- An all-crawls/search-values endpoint is added to support searching across all archived item types
- Crawl configuration names are now copied to the crawl when the crawl is created, and crawl names and descriptions are now editable via the backend API (note: this will also require frontend changes to make them editable via the UI)
- Migration added to copy existing config names for active configs into their associated crawls. This migration has been tested in a local deployment
- New statuses generate-wacz, uploading-wacz, and pending-wait are added to tests where relevant to ensure that they pass
- Test coverage added for all new all-crawls endpoints, filters, and sort values

Usage sketches for the new list filters, the search-values endpoint, and the crawl metadata updates follow the diff below.
---
 backend/btrixcloud/basecrawls.py | 91 +++++-
 backend/btrixcloud/crawls.py | 21 +-
 backend/btrixcloud/db.py | 2 +-
 .../migrations/migration_0013_crawl_name.py | 42 +++
 backend/btrixcloud/models.py | 9 +-
 backend/btrixcloud/uploads.py | 10 +
 backend/test/conftest.py | 55 ++++
 backend/test/test_run_crawl.py | 4 +-
 backend/test/test_stop_cancel_crawl.py | 25 +-
 backend/test/test_uploads.py | 271 ++++++++++++++++++
 .../test_concurrent_crawl_limit.py | 11 +-
 11 files changed, 505 insertions(+), 36 deletions(-)
 create mode 100644 backend/btrixcloud/migrations/migration_0013_crawl_name.py

diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index bd626cde..1b484c61 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -5,6 +5,7 @@ import uuid import os from datetime import timedelta from typing import Optional, List, Union +import urllib.parse from pydantic import UUID4 from fastapi import HTTPException, Depends @@ -196,16 +197,11 @@ class BaseCrawlOps: config = await self.crawl_configs.get_crawl_config( crawl.cid, org, active_only=False ) - - if config: - if not crawl.name: - crawl.name = config.name - - if config.config.seeds: - if add_first_seed: - first_seed = config.config.seeds[0] - crawl.firstSeed = first_seed.url - crawl.seedCount = len(config.config.seeds) + if config and config.config.seeds: + if add_first_seed: + first_seed = config.config.seeds[0] + crawl.firstSeed = first_seed.url + crawl.seedCount = len(config.config.seeds) if hasattr(crawl, "profileid") and crawl.profileid: crawl.profileName = await self.crawl_configs.profiles.get_profile_name( @@ -327,7 +323,7 @@ class BaseCrawlOps: {"$pull": {"collections": collection_id}}, ) - # pylint: disable=too-many-branches + # pylint: disable=too-many-branches, invalid-name async def list_all_base_crawls( self, org: Optional[Organization] = None, @@ -336,12 +332,14 @@ class BaseCrawlOps: description: str = None, collection_id: str = None, states: Optional[List[str]] = None, + first_seed: Optional[str] = None, + type_: Optional[str] = None, + cid: Optional[UUID4] = None, cls_type: Union[CrawlOut, CrawlOutWithResources] = CrawlOut, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1, sort_by: str = None, sort_direction: int = -1, - type_=None, ): """List crawls of all types from the db""" # Zero-index page for query
# validated_states = [value for value in state if value in ALL_CRAWL_STATES] query["state"] = {"$in": states} - aggregate = [{"$match": query}, {"$unset": "errors"}] + if cid: + query["cid"] = cid + + aggregate = [ + {"$match": query}, + {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}}, + {"$set": {"firstSeed": "$firstSeedObject.url"}}, + {"$unset": ["firstSeedObject", "errors"]}, + ] if not resources: aggregate.extend([{"$unset": ["files"]}]) @@ -375,6 +381,9 @@ class BaseCrawlOps: if name: aggregate.extend([{"$match": {"name": name}}]) + if first_seed: + aggregate.extend([{"$match": {"firstSeed": first_seed}}]) + if description: aggregate.extend([{"$match": {"description": description}}]) @@ -382,7 +391,7 @@ class BaseCrawlOps: aggregate.extend([{"$match": {"collections": {"$in": [collection_id]}}}]) if sort_by: - if sort_by not in ("started", "finished"): + if sort_by not in ("started", "finished", "fileSize"): raise HTTPException(status_code=400, detail="invalid_sort_by") if sort_direction not in (1, -1): raise HTTPException(status_code=400, detail="invalid_sort_direction") @@ -447,13 +456,40 @@ class BaseCrawlOps: return {"deleted": True} + async def get_all_crawl_search_values(self, org: Organization): + """List unique names, first seeds, and descriptions from all captures in org""" + names = await self.crawls.distinct("name", {"oid": org.id}) + descriptions = await self.crawls.distinct("description", {"oid": org.id}) + crawl_ids = await self.crawls.distinct("_id", {"oid": org.id}) + cids = await self.crawls.distinct("cid", {"oid": org.id}) + + # Remove empty strings + names = [name for name in names if name] + descriptions = [description for description in descriptions if description] + + # Get first seeds + first_seeds = set() + for cid in cids: + if not cid: + continue + config = await self.crawl_configs.get_crawl_config(cid, org) + first_seed = config.config.seeds[0] + first_seeds.add(first_seed.url) + + return { + "names": names, + "descriptions": descriptions, + "firstSeeds": list(first_seeds), + "crawlIds": list(crawl_ids), + } + # ============================================================================ def init_base_crawls_api( app, mdb, users, crawl_manager, crawl_config_ops, orgs, user_dep ): """base crawls api""" - # pylint: disable=invalid-name, duplicate-code, too-many-arguments + # pylint: disable=invalid-name, duplicate-code, too-many-arguments, too-many-locals ops = BaseCrawlOps(mdb, users, crawl_config_ops, crawl_manager) @@ -472,12 +508,28 @@ def init_base_crawls_api( userid: Optional[UUID4] = None, name: Optional[str] = None, state: Optional[str] = None, + firstSeed: Optional[str] = None, description: Optional[str] = None, collectionId: Optional[UUID4] = None, + crawlType: Optional[str] = None, + cid: Optional[UUID4] = None, sortBy: Optional[str] = "finished", sortDirection: Optional[int] = -1, ): states = state.split(",") if state else None + + if firstSeed: + firstSeed = urllib.parse.unquote(firstSeed) + + if name: + name = urllib.parse.unquote(name) + + if description: + description = urllib.parse.unquote(description) + + if crawlType and crawlType not in ("crawl", "upload"): + raise HTTPException(status_code=400, detail="invalid_crawl_type") + crawls, total = await ops.list_all_base_crawls( org, userid=userid, @@ -485,6 +537,9 @@ def init_base_crawls_api( description=description, collection_id=collectionId, states=states, + first_seed=firstSeed, + type_=crawlType, + cid=cid, page_size=pageSize, page=page, sort_by=sortBy, @@ -492,6 
+547,12 @@ def init_base_crawls_api( ) return paginated_format(crawls, total, page, pageSize) + @app.get("/orgs/{oid}/all-crawls/search-values", tags=["all-crawls"]) + async def get_all_crawls_search_values( + org: Organization = Depends(org_viewer_dep), + ): + return await ops.get_all_crawl_search_values(org) + @app.get( "/orgs/{oid}/all-crawls/{crawl_id}", tags=["all-crawls"], diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 30a2df9a..2ed4c5a7 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -68,11 +68,15 @@ class CrawlOps(BaseCrawlOps): await self.crawls.create_index( [("type", pymongo.HASHED), ("state", pymongo.DESCENDING)] ) + await self.crawls.create_index( + [("type", pymongo.HASHED), ("fileSize", pymongo.DESCENDING)] + ) await self.crawls.create_index([("finished", pymongo.DESCENDING)]) await self.crawls.create_index([("oid", pymongo.HASHED)]) await self.crawls.create_index([("cid", pymongo.HASHED)]) await self.crawls.create_index([("state", pymongo.HASHED)]) + await self.crawls.create_index([("fileSize", pymongo.DESCENDING)]) async def list_crawls( self, @@ -127,15 +131,6 @@ class CrawlOps(BaseCrawlOps): {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}}, {"$set": {"firstSeed": "$firstSeedObject.url"}}, {"$unset": ["firstSeedObject", "errors"]}, - { - "$lookup": { - "from": "crawl_configs", - "localField": "cid", - "foreignField": "_id", - "as": "crawlConfig", - }, - }, - {"$set": {"name": {"$arrayElemAt": ["$crawlConfig.name", 0]}}}, ] if not resources: @@ -154,7 +149,12 @@ class CrawlOps(BaseCrawlOps): aggregate.extend([{"$match": {"collections": {"$in": [collection_id]}}}]) if sort_by: - if sort_by not in ("started", "finished", "fileSize", "firstSeed"): + if sort_by not in ( + "started", + "finished", + "fileSize", + "firstSeed", + ): raise HTTPException(status_code=400, detail="invalid_sort_by") if sort_direction not in (1, -1): raise HTTPException(status_code=400, detail="invalid_sort_direction") @@ -545,6 +545,7 @@ async def add_new_crawl( manual=manual, started=started, tags=crawlconfig.tags, + name=crawlconfig.name, ) try: diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index 8151445b..be63c140 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -15,7 +15,7 @@ from pymongo.errors import InvalidName from .migrations import BaseMigration -CURR_DB_VERSION = "0012" +CURR_DB_VERSION = "0013" # ============================================================================ diff --git a/backend/btrixcloud/migrations/migration_0013_crawl_name.py b/backend/btrixcloud/migrations/migration_0013_crawl_name.py new file mode 100644 index 00000000..9b511101 --- /dev/null +++ b/backend/btrixcloud/migrations/migration_0013_crawl_name.py @@ -0,0 +1,42 @@ +""" +Migration 0013 - Copy config name to crawls +""" +from btrixcloud.migrations import BaseMigration + + +MIGRATION_VERSION = "0013" + + +class Migration(BaseMigration): + """Migration class.""" + + def __init__(self, mdb, migration_version=MIGRATION_VERSION): + super().__init__(mdb, migration_version) + + async def migrate_up(self): + """Perform migration up. + + Copy crawl config names to associated crawls. 
+ """ + # pylint: disable=duplicate-code + crawls = self.mdb["crawls"] + crawl_configs = self.mdb["crawl_configs"] + + configs = [res async for res in crawl_configs.find({"inactive": {"$ne": True}})] + if not configs: + return + + for config in configs: + config_id = config["_id"] + try: + if not config.get("name"): + continue + await crawls.update_many( + {"cid": config_id}, {"$set": {"name": config.get("name")}} + ) + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Unable to set name for crawls from with config {config_id}: {err}", + flush=True, + ) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 69b005eb..87839900 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -296,6 +296,8 @@ class BaseCrawl(BaseMongoModel): started: datetime finished: Optional[datetime] + name: Optional[str] + state: str stats: Optional[Dict[str, int]] @@ -368,7 +370,9 @@ class CrawlOutWithResources(CrawlOut): class UpdateCrawl(BaseModel): """Update crawl""" - tags: Optional[List[str]] = [] + name: Optional[str] + description: Optional[str] + tags: Optional[List[str]] description: Optional[str] @@ -433,7 +437,6 @@ class UploadedCrawl(BaseCrawl): type: str = Field("upload", const=True) - name: str tags: Optional[List[str]] = [] @@ -441,8 +444,6 @@ class UploadedCrawl(BaseCrawl): class UpdateUpload(UpdateCrawl): """Update modal that also includes name""" - name: Optional[str] - # ============================================================================ diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py index a22e41ba..d4313a45 100644 --- a/backend/btrixcloud/uploads.py +++ b/backend/btrixcloud/uploads.py @@ -299,6 +299,7 @@ def init_uploads_api(app, mdb, users, crawl_manager, crawl_configs, orgs, user_d org: Organization = Depends(org_viewer_dep), pageSize: int = DEFAULT_PAGE_SIZE, page: int = 1, + state: Optional[str] = None, userid: Optional[UUID4] = None, name: Optional[str] = None, description: Optional[str] = None, @@ -306,9 +307,18 @@ def init_uploads_api(app, mdb, users, crawl_manager, crawl_configs, orgs, user_d sortBy: Optional[str] = "finished", sortDirection: Optional[int] = -1, ): + states = state.split(",") if state else None + + if name: + name = unquote(name) + + if description: + description = unquote(description) + uploads, total = await ops.list_all_base_crawls( org, userid=userid, + states=states, name=name, description=description, page_size=pageSize, diff --git a/backend/test/conftest.py b/backend/test/conftest.py index 25b75d9e..a52f6297 100644 --- a/backend/test/conftest.py +++ b/backend/test/conftest.py @@ -18,6 +18,7 @@ CRAWLER_PW = "crawlerPASSWORD!" _admin_config_id = None _crawler_config_id = None _auto_add_config_id = None +_all_crawls_config_id = None NON_DEFAULT_ORG_NAME = "Non-default org" @@ -118,6 +119,12 @@ def admin_config_id(admin_crawl_id): return _admin_config_id +@pytest.fixture(scope="session") +def admin_userid(admin_auth_headers): + r = requests.get(f"{API_PREFIX}/users/me", headers=admin_auth_headers) + return r.json()["id"] + + @pytest.fixture(scope="session") def viewer_auth_headers(admin_auth_headers, default_org_id): requests.post( @@ -331,6 +338,54 @@ def auto_add_config_id(auto_add_crawl_id): return _auto_add_config_id +@pytest.fixture(scope="session") +def all_crawls_crawl_id(crawler_auth_headers, default_org_id): + # Start crawl. 
+ crawl_data = { + "runNow": True, + "name": "All Crawls Test Crawl", + "description": "Lorem ipsum", + "config": { + "seeds": [{"url": "https://webrecorder.net/"}], + }, + } + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=crawler_auth_headers, + json=crawl_data, + ) + data = r.json() + + global _all_crawls_config_id + _all_crawls_config_id = data["id"] + + crawl_id = data["run_now_job"] + # Wait for it to complete and then return crawl ID + while True: + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json", + headers=crawler_auth_headers, + ) + data = r.json() + if data["state"] == "complete": + break + time.sleep(5) + + # Add description to crawl + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}", + headers=crawler_auth_headers, + json={"description": "Lorem ipsum"}, + ) + assert r.status_code == 200 + return crawl_id + + +@pytest.fixture(scope="session") +def all_crawls_config_id(all_crawls_crawl_id): + return _all_crawls_config_id + + @pytest.fixture(scope="session") def uploads_collection_id(crawler_auth_headers, default_org_id): r = requests.post( diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index e4fb89f1..2c168023 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -191,10 +191,11 @@ def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id): # Submit patch request to update tags and description UPDATED_TAGS = ["wr-test-1-updated", "wr-test-2-updated"] UPDATED_DESC = "Lorem ipsum test note." + UPDATED_NAME = "Updated crawl name" r = requests.patch( f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}", headers=admin_auth_headers, - json={"tags": UPDATED_TAGS, "description": UPDATED_DESC}, + json={"tags": UPDATED_TAGS, "description": UPDATED_DESC, "name": UPDATED_NAME}, ) assert r.status_code == 200 data = r.json() @@ -209,6 +210,7 @@ def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id): data = r.json() assert sorted(data["tags"]) == sorted(UPDATED_TAGS) assert data["description"] == UPDATED_DESC + assert data["name"] == UPDATED_NAME # Verify deleting works as well r = requests.patch( diff --git a/backend/test/test_stop_cancel_crawl.py b/backend/test/test_stop_cancel_crawl.py index 225840fd..bbc114be 100644 --- a/backend/test/test_stop_cancel_crawl.py +++ b/backend/test/test_stop_cancel_crawl.py @@ -48,7 +48,14 @@ def test_cancel_crawl(default_org_id, crawler_auth_headers): data = get_crawl(default_org_id, crawler_auth_headers, crawl_id) - while data["state"] in ("running", "waiting_capacity"): + while data["state"] in ( + "starting", + "running", + "waiting_capacity", + "generate-wacz", + "uploading-wacz", + "pending-wait", + ): time.sleep(5) data = get_crawl(default_org_id, crawler_auth_headers, crawl_id) @@ -88,7 +95,14 @@ def test_start_crawl_and_stop_immediately( ) assert r.json()["lastCrawlStopping"] == True - while data["state"] in ("starting", "running", "waiting_capacity"): + while data["state"] in ( + "starting", + "running", + "waiting_capacity", + "generate-wacz", + "uploading-wacz", + "pending-wait", + ): time.sleep(5) data = get_crawl(default_org_id, crawler_auth_headers, crawl_id) @@ -149,7 +163,12 @@ def test_stop_crawl_partial( ) assert r.json()["lastCrawlStopping"] == True - while data["state"] == "running": + while data["state"] in ( + "running", + "generate-wacz", + "uploading-wacz", + "pending-wait", + ): time.sleep(5) data = get_crawl(default_org_id, 
crawler_auth_headers, crawl_id) diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index 2a46206a..db13cf9d 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -9,6 +9,8 @@ upload_id = None upload_id_2 = None upload_dl_path = None +_coll_id = None + curr_dir = os.path.dirname(os.path.realpath(__file__)) @@ -371,6 +373,275 @@ def test_list_all_crawls(admin_auth_headers, default_org_id): assert item["state"] +def test_get_all_crawls_by_name(admin_auth_headers, default_org_id): + """Test filtering /all-crawls by name""" + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?name=test2.wacz", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 1 + items = data["items"] + assert items[0]["id"] == upload_id_2 + assert items[0]["name"] == "test2.wacz" + + crawl_name = "Crawler User Test Crawl" + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?name={crawl_name}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 3 + for item in data["items"]: + assert item["name"] == crawl_name + + +def test_get_all_crawls_by_first_seed( + admin_auth_headers, default_org_id, crawler_crawl_id +): + """Test filtering /all-crawls by first seed""" + first_seed = "https://webrecorder.net/" + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?firstSeed={first_seed}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 3 + for item in data["items"]: + assert item["firstSeed"] == first_seed + + +def test_get_all_crawls_by_type(admin_auth_headers, default_org_id, admin_crawl_id): + """Test filtering /all-crawls by crawl type""" + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=crawl", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 3 + for item in data["items"]: + assert item["type"] == "crawl" + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=upload", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 3 + for item in data["items"]: + assert item["type"] == "upload" + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=invalid", + headers=admin_auth_headers, + ) + assert r.status_code == 400 + assert r.json()["detail"] == "invalid_crawl_type" + + +def test_get_all_crawls_by_user(admin_auth_headers, default_org_id, crawler_userid): + """Test filtering /all-crawls by userid""" + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?userid={crawler_userid}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 4 + for item in data["items"]: + assert item["userid"] == crawler_userid + + +def test_get_all_crawls_by_cid( + admin_auth_headers, default_org_id, all_crawls_config_id +): + """Test filtering /all-crawls by cid""" + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?cid={all_crawls_config_id}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 1 + assert data["items"][0]["cid"] == all_crawls_config_id + + +def test_get_all_crawls_by_state(admin_auth_headers, default_org_id, admin_crawl_id): + """Test filtering /all-crawls by cid""" + r = requests.get( + 
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?state=complete,partial_complete", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 5 + items = data["items"] + for item in items: + assert item["state"] in ("complete", "partial_complete") + + +def test_get_all_crawls_by_collection_id( + admin_auth_headers, default_org_id, admin_config_id, all_crawls_crawl_id +): + """Test filtering /all-crawls by collection id""" + # Create collection and add upload to it + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/collections", + headers=admin_auth_headers, + json={ + "crawlIds": [all_crawls_crawl_id], + "name": "all-crawls collection", + }, + ) + assert r.status_code == 200 + global _coll_id + _coll_id = r.json()["id"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?collectionId={_coll_id}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + assert r.json()["total"] == 1 + assert r.json()["items"][0]["id"] == all_crawls_crawl_id + + +def test_sort_all_crawls(admin_auth_headers, default_org_id, admin_crawl_id): + # Sort by started, descending (default) + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started", + headers=admin_auth_headers, + ) + data = r.json() + assert data["total"] == 7 + items = data["items"] + assert len(items) == 7 + + last_created = None + for crawl in items: + if last_created: + assert crawl["started"] <= last_created + last_created = crawl["started"] + + # Sort by started, ascending + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started&sortDirection=1", + headers=admin_auth_headers, + ) + data = r.json() + items = data["items"] + + last_created = None + for crawl in items: + if last_created: + assert crawl["started"] >= last_created + last_created = crawl["started"] + + # Sort by finished + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=finished", + headers=admin_auth_headers, + ) + data = r.json() + items = data["items"] + + last_finished = None + for crawl in items: + if not crawl["finished"]: + continue + if last_finished: + assert crawl["finished"] <= last_finished + last_finished = crawl["finished"] + + # Sort by finished, ascending + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=finished&sortDirection=1", + headers=admin_auth_headers, + ) + data = r.json() + items = data["items"] + + last_finished = None + for crawl in items: + if not crawl["finished"]: + continue + if last_finished: + assert crawl["finished"] >= last_finished + last_finished = crawl["finished"] + + # Sort by fileSize + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=fileSize", + headers=admin_auth_headers, + ) + data = r.json() + items = data["items"] + + last_size = None + for crawl in items: + if last_size: + assert crawl["fileSize"] <= last_size + last_size = crawl["fileSize"] + + # Sort by fileSize, ascending + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=fileSize&sortDirection=1", + headers=admin_auth_headers, + ) + data = r.json() + items = data["items"] + + last_size = None + for crawl in items: + if last_size: + assert crawl["fileSize"] >= last_size + last_size = crawl["fileSize"] + + # Invalid sort value + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=invalid", + headers=admin_auth_headers, + ) + assert r.status_code == 400 + assert r.json()["detail"] == "invalid_sort_by" + + # Invalid 
sort_direction value + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started&sortDirection=0", + headers=admin_auth_headers, + ) + assert r.status_code == 400 + assert r.json()["detail"] == "invalid_sort_direction" + + +def test_all_crawls_search_values(admin_auth_headers, default_org_id): + """Test that all-crawls search values return expected results""" + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/search-values", + headers=admin_auth_headers, + ) + data = r.json() + + assert len(data["names"]) == 5 + expected_names = [ + "Crawler User Test Crawl", + "My Upload Updated", + "test2.wacz", + "All Crawls Test Crawl", + ] + for expected_name in expected_names: + assert expected_name in data["names"] + + assert sorted(data["descriptions"]) == ["Lorem ipsum"] + assert sorted(data["firstSeeds"]) == ["https://webrecorder.net/"] + assert len(data["crawlIds"]) == 7 + + def test_get_upload_from_all_crawls(admin_auth_headers, default_org_id): """Test that /all-crawls lists crawls and uploads before deleting uploads""" r = requests.get( diff --git a/backend/test_nightly/test_concurrent_crawl_limit.py b/backend/test_nightly/test_concurrent_crawl_limit.py index 3f0e680c..7f6ee3d2 100644 --- a/backend/test_nightly/test_concurrent_crawl_limit.py +++ b/backend/test_nightly/test_concurrent_crawl_limit.py @@ -31,8 +31,11 @@ def test_run_two_only_one_concurrent(org_with_quotas, admin_auth_headers): ): time.sleep(2) - assert ( - get_crawl_status(org_with_quotas, crawl_id_a, admin_auth_headers) == "running" + assert get_crawl_status(org_with_quotas, crawl_id_a, admin_auth_headers) in ( + "running", + "generate-wacz", + "uploading-wacz", + "pending-wait", ) while ( @@ -68,6 +71,10 @@ def test_cancel_and_run_other(org_with_quotas, admin_auth_headers): assert get_crawl_status(org_with_quotas, crawl_id_b, admin_auth_headers) in ( "starting", "running", + "waiting_capacity", + "generate-wacz", + "uploading-wacz", + "pending-wait", ) # cancel second crawl as well
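
Usage sketch: the tests above exercise the updated /all-crawls list filters and the new search-values endpoint directly; as a quick client-side reference, a minimal sketch might look like the following. API_PREFIX, ORG_ID, and AUTH_HEADERS are placeholders for a running deployment, not values from this patch; requests URL-encodes the query values, and the endpoint decodes free-text values (name, firstSeed, description) before filtering, as added here.

# Sketch: filtering/sorting archived items via /all-crawls and fetching
# the new search values. Placeholder deployment values, not from this patch.
import requests

API_PREFIX = "https://app.example.com/api"  # placeholder base URL
ORG_ID = "00000000-0000-0000-0000-000000000000"  # placeholder org id
AUTH_HEADERS = {"Authorization": "Bearer <token>"}  # placeholder auth

# requests URL-encodes these query values; the endpoint decodes
# name/firstSeed/description before matching.
params = {
    "firstSeed": "https://webrecorder.net/",
    "crawlType": "crawl",                  # "crawl" or "upload"
    "state": "complete,partial_complete",  # comma-separated states
    "sortBy": "fileSize",                  # started, finished, or fileSize
    "sortDirection": -1,
}
r = requests.get(
    f"{API_PREFIX}/orgs/{ORG_ID}/all-crawls",
    headers=AUTH_HEADERS,
    params=params,
)
items = r.json()["items"]

# Distinct names, descriptions, first seeds, and crawl ids for building
# search controls across all archived item types.
r = requests.get(
    f"{API_PREFIX}/orgs/{ORG_ID}/all-crawls/search-values",
    headers=AUTH_HEADERS,
)
search_values = r.json()  # keys: names, descriptions, firstSeeds, crawlIds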
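
Usage sketch: crawl names and descriptions are now editable through the existing crawl PATCH endpoint, as exercised in test_update_crawl above. A sketch of that call, again with placeholder deployment values (API_PREFIX, ORG_ID, CRAWL_ID, AUTH_HEADERS are not from this patch):

# Sketch: updating a crawl's name, description, and tags via the backend API.
import requests

API_PREFIX = "https://app.example.com/api"  # placeholder base URL
ORG_ID = "00000000-0000-0000-0000-000000000000"  # placeholder org id
CRAWL_ID = "example-crawl-id"  # placeholder crawl id
AUTH_HEADERS = {"Authorization": "Bearer <token>"}  # placeholder auth

r = requests.patch(
    f"{API_PREFIX}/orgs/{ORG_ID}/crawls/{CRAWL_ID}",
    headers=AUTH_HEADERS,
    json={
        "name": "Updated crawl name",
        "description": "Lorem ipsum test note.",
        "tags": ["wr-test-1-updated"],
    },
)
assert r.status_code == 200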