From 7ff57ce6b5207f3d73afcd41ad24f8983a85f4ae Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Fri, 4 Aug 2023 12:56:52 -0400
Subject: [PATCH] Backend: standardize search values, filters, and sorting for archived items (#1039)

- all-crawls list endpoint filters now conform to 'Standardize list controls for archived items #1025' and URL-decode values before passing them in
- Uploads list endpoint now includes all all-crawls filters relevant to uploads
- An all-crawls/search-values endpoint is added to support searching across all archived item types
- Crawl configuration names are now copied to the crawl when the crawl is created, and crawl names and descriptions are now editable via the backend API (note: this will also require frontend changes to make them editable via the UI)
- Migration added to copy existing config names for active configs into their associated crawls. This migration has been tested in a local deployment
- New statuses generate-wacz, uploading-wacz, and pending-wait are added to tests where relevant to ensure that they pass
- Test coverage added for all new all-crawls endpoints, filters, and sort values

Usage sketches for the new list filters, the search-values endpoint, and the crawl metadata updates follow the diff below.
---
 backend/btrixcloud/basecrawls.py | 91 +++++-
 backend/btrixcloud/crawls.py | 21 +-
 backend/btrixcloud/db.py | 2 +-
 .../migrations/migration_0013_crawl_name.py | 42 +++
 backend/btrixcloud/models.py | 9 +-
 backend/btrixcloud/uploads.py | 10 +
 backend/test/conftest.py | 55 ++++
 backend/test/test_run_crawl.py | 4 +-
 backend/test/test_stop_cancel_crawl.py | 25 +-
 backend/test/test_uploads.py | 271 ++++++++++++++++++
 .../test_concurrent_crawl_limit.py | 11 +-
 11 files changed, 505 insertions(+), 36 deletions(-)
 create mode 100644 backend/btrixcloud/migrations/migration_0013_crawl_name.py

diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index bd626cde..1b484c61 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -5,6 +5,7 @@ import uuid import os from datetime import timedelta from typing import Optional, List, Union +import urllib.parse from pydantic import UUID4 from fastapi import HTTPException, Depends @@ -196,16 +197,11 @@ class BaseCrawlOps: config = await self.crawl_configs.get_crawl_config( crawl.cid, org, active_only=False ) - - if config: - if not crawl.name: - crawl.name = config.name - - if config.config.seeds: - if add_first_seed: - first_seed = config.config.seeds[0] - crawl.firstSeed = first_seed.url - crawl.seedCount = len(config.config.seeds) + if config and config.config.seeds: + if add_first_seed: + first_seed = config.config.seeds[0] + crawl.firstSeed = first_seed.url + crawl.seedCount = len(config.config.seeds) if hasattr(crawl, "profileid") and crawl.profileid: crawl.profileName = await self.crawl_configs.profiles.get_profile_name( @@ -327,7 +323,7 @@ class BaseCrawlOps: {"$pull": {"collections": collection_id}}, ) - # pylint: disable=too-many-branches + # pylint: disable=too-many-branches, invalid-name async def list_all_base_crawls( self, org: Optional[Organization] = None, @@ -336,12 +332,14 @@ class BaseCrawlOps: description: str = None, collection_id: str = None, states: Optional[List[str]] = None, + first_seed: Optional[str] = None, + type_: Optional[str] = None, + cid: Optional[UUID4] = None, cls_type: Union[CrawlOut, CrawlOutWithResources] = CrawlOut, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1, sort_by: str = None, sort_direction: int = -1, - type_=None, ): """List crawls of all types from the db""" # Zero-index page for query
# validated_states = [value for value in state if value in ALL_CRAWL_STATES] query["state"] = {"$in": states} - aggregate = [{"$match": query}, {"$unset": "errors"}] + if cid: + query["cid"] = cid + + aggregate = [ + {"$match": query}, + {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}}, + {"$set": {"firstSeed": "$firstSeedObject.url"}}, + {"$unset": ["firstSeedObject", "errors"]}, + ] if not resources: aggregate.extend([{"$unset": ["files"]}]) @@ -375,6 +381,9 @@ class BaseCrawlOps: if name: aggregate.extend([{"$match": {"name": name}}]) + if first_seed: + aggregate.extend([{"$match": {"firstSeed": first_seed}}]) + if description: aggregate.extend([{"$match": {"description": description}}]) @@ -382,7 +391,7 @@ class BaseCrawlOps: aggregate.extend([{"$match": {"collections": {"$in": [collection_id]}}}]) if sort_by: - if sort_by not in ("started", "finished"): + if sort_by not in ("started", "finished", "fileSize"): raise HTTPException(status_code=400, detail="invalid_sort_by") if sort_direction not in (1, -1): raise HTTPException(status_code=400, detail="invalid_sort_direction") @@ -447,13 +456,40 @@ class BaseCrawlOps: return {"deleted": True} + async def get_all_crawl_search_values(self, org: Organization): + """List unique names, first seeds, and descriptions from all captures in org""" + names = await self.crawls.distinct("name", {"oid": org.id}) + descriptions = await self.crawls.distinct("description", {"oid": org.id}) + crawl_ids = await self.crawls.distinct("_id", {"oid": org.id}) + cids = await self.crawls.distinct("cid", {"oid": org.id}) + + # Remove empty strings + names = [name for name in names if name] + descriptions = [description for description in descriptions if description] + + # Get first seeds + first_seeds = set() + for cid in cids: + if not cid: + continue + config = await self.crawl_configs.get_crawl_config(cid, org) + first_seed = config.config.seeds[0] + first_seeds.add(first_seed.url) + + return { + "names": names, + "descriptions": descriptions, + "firstSeeds": list(first_seeds), + "crawlIds": list(crawl_ids), + } + # ============================================================================ def init_base_crawls_api( app, mdb, users, crawl_manager, crawl_config_ops, orgs, user_dep ): """base crawls api""" - # pylint: disable=invalid-name, duplicate-code, too-many-arguments + # pylint: disable=invalid-name, duplicate-code, too-many-arguments, too-many-locals ops = BaseCrawlOps(mdb, users, crawl_config_ops, crawl_manager) @@ -472,12 +508,28 @@ def init_base_crawls_api( userid: Optional[UUID4] = None, name: Optional[str] = None, state: Optional[str] = None, + firstSeed: Optional[str] = None, description: Optional[str] = None, collectionId: Optional[UUID4] = None, + crawlType: Optional[str] = None, + cid: Optional[UUID4] = None, sortBy: Optional[str] = "finished", sortDirection: Optional[int] = -1, ): states = state.split(",") if state else None + + if firstSeed: + firstSeed = urllib.parse.unquote(firstSeed) + + if name: + name = urllib.parse.unquote(name) + + if description: + description = urllib.parse.unquote(description) + + if crawlType and crawlType not in ("crawl", "upload"): + raise HTTPException(status_code=400, detail="invalid_crawl_type") + crawls, total = await ops.list_all_base_crawls( org, userid=userid, @@ -485,6 +537,9 @@ def init_base_crawls_api( description=description, collection_id=collectionId, states=states, + first_seed=firstSeed, + type_=crawlType, + cid=cid, page_size=pageSize, page=page, sort_by=sortBy, @@ -492,6 
+547,12 @@ def init_base_crawls_api( ) return paginated_format(crawls, total, page, pageSize) + @app.get("/orgs/{oid}/all-crawls/search-values", tags=["all-crawls"]) + async def get_all_crawls_search_values( + org: Organization = Depends(org_viewer_dep), + ): + return await ops.get_all_crawl_search_values(org) + @app.get( "/orgs/{oid}/all-crawls/{crawl_id}", tags=["all-crawls"], diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 30a2df9a..2ed4c5a7 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -68,11 +68,15 @@ class CrawlOps(BaseCrawlOps): await self.crawls.create_index( [("type", pymongo.HASHED), ("state", pymongo.DESCENDING)] ) + await self.crawls.create_index( + [("type", pymongo.HASHED), ("fileSize", pymongo.DESCENDING)] + ) await self.crawls.create_index([("finished", pymongo.DESCENDING)]) await self.crawls.create_index([("oid", pymongo.HASHED)]) await self.crawls.create_index([("cid", pymongo.HASHED)]) await self.crawls.create_index([("state", pymongo.HASHED)]) + await self.crawls.create_index([("fileSize", pymongo.DESCENDING)]) async def list_crawls( self, @@ -127,15 +131,6 @@ class CrawlOps(BaseCrawlOps): {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}}, {"$set": {"firstSeed": "$firstSeedObject.url"}}, {"$unset": ["firstSeedObject", "errors"]}, - { - "$lookup": { - "from": "crawl_configs", - "localField": "cid", - "foreignField": "_id", - "as": "crawlConfig", - }, - }, - {"$set": {"name": {"$arrayElemAt": ["$crawlConfig.name", 0]}}}, ] if not resources: @@ -154,7 +149,12 @@ class CrawlOps(BaseCrawlOps): aggregate.extend([{"$match": {"collections": {"$in": [collection_id]}}}]) if sort_by: - if sort_by not in ("started", "finished", "fileSize", "firstSeed"): + if sort_by not in ( + "started", + "finished", + "fileSize", + "firstSeed", + ): raise HTTPException(status_code=400, detail="invalid_sort_by") if sort_direction not in (1, -1): raise HTTPException(status_code=400, detail="invalid_sort_direction") @@ -545,6 +545,7 @@ async def add_new_crawl( manual=manual, started=started, tags=crawlconfig.tags, + name=crawlconfig.name, ) try: diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index 8151445b..be63c140 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -15,7 +15,7 @@ from pymongo.errors import InvalidName from .migrations import BaseMigration -CURR_DB_VERSION = "0012" +CURR_DB_VERSION = "0013" # ============================================================================ diff --git a/backend/btrixcloud/migrations/migration_0013_crawl_name.py b/backend/btrixcloud/migrations/migration_0013_crawl_name.py new file mode 100644 index 00000000..9b511101 --- /dev/null +++ b/backend/btrixcloud/migrations/migration_0013_crawl_name.py @@ -0,0 +1,42 @@ +""" +Migration 0013 - Copy config name to crawls +""" +from btrixcloud.migrations import BaseMigration + + +MIGRATION_VERSION = "0013" + + +class Migration(BaseMigration): + """Migration class.""" + + def __init__(self, mdb, migration_version=MIGRATION_VERSION): + super().__init__(mdb, migration_version) + + async def migrate_up(self): + """Perform migration up. + + Copy crawl config names to associated crawls. 
+ """ + # pylint: disable=duplicate-code + crawls = self.mdb["crawls"] + crawl_configs = self.mdb["crawl_configs"] + + configs = [res async for res in crawl_configs.find({"inactive": {"$ne": True}})] + if not configs: + return + + for config in configs: + config_id = config["_id"] + try: + if not config.get("name"): + continue + await crawls.update_many( + {"cid": config_id}, {"$set": {"name": config.get("name")}} + ) + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Unable to set name for crawls from with config {config_id}: {err}", + flush=True, + ) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 69b005eb..87839900 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -296,6 +296,8 @@ class BaseCrawl(BaseMongoModel): started: datetime finished: Optional[datetime] + name: Optional[str] + state: str stats: Optional[Dict[str, int]] @@ -368,7 +370,9 @@ class CrawlOutWithResources(CrawlOut): class UpdateCrawl(BaseModel): """Update crawl""" - tags: Optional[List[str]] = [] + name: Optional[str] + description: Optional[str] + tags: Optional[List[str]] description: Optional[str] @@ -433,7 +437,6 @@ class UploadedCrawl(BaseCrawl): type: str = Field("upload", const=True) - name: str tags: Optional[List[str]] = [] @@ -441,8 +444,6 @@ class UploadedCrawl(BaseCrawl): class UpdateUpload(UpdateCrawl): """Update modal that also includes name""" - name: Optional[str] - # ============================================================================ diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py index a22e41ba..d4313a45 100644 --- a/backend/btrixcloud/uploads.py +++ b/backend/btrixcloud/uploads.py @@ -299,6 +299,7 @@ def init_uploads_api(app, mdb, users, crawl_manager, crawl_configs, orgs, user_d org: Organization = Depends(org_viewer_dep), pageSize: int = DEFAULT_PAGE_SIZE, page: int = 1, + state: Optional[str] = None, userid: Optional[UUID4] = None, name: Optional[str] = None, description: Optional[str] = None, @@ -306,9 +307,18 @@ def init_uploads_api(app, mdb, users, crawl_manager, crawl_configs, orgs, user_d sortBy: Optional[str] = "finished", sortDirection: Optional[int] = -1, ): + states = state.split(",") if state else None + + if name: + name = unquote(name) + + if description: + description = unquote(description) + uploads, total = await ops.list_all_base_crawls( org, userid=userid, + states=states, name=name, description=description, page_size=pageSize, diff --git a/backend/test/conftest.py b/backend/test/conftest.py index 25b75d9e..a52f6297 100644 --- a/backend/test/conftest.py +++ b/backend/test/conftest.py @@ -18,6 +18,7 @@ CRAWLER_PW = "crawlerPASSWORD!" _admin_config_id = None _crawler_config_id = None _auto_add_config_id = None +_all_crawls_config_id = None NON_DEFAULT_ORG_NAME = "Non-default org" @@ -118,6 +119,12 @@ def admin_config_id(admin_crawl_id): return _admin_config_id +@pytest.fixture(scope="session") +def admin_userid(admin_auth_headers): + r = requests.get(f"{API_PREFIX}/users/me", headers=admin_auth_headers) + return r.json()["id"] + + @pytest.fixture(scope="session") def viewer_auth_headers(admin_auth_headers, default_org_id): requests.post( @@ -331,6 +338,54 @@ def auto_add_config_id(auto_add_crawl_id): return _auto_add_config_id +@pytest.fixture(scope="session") +def all_crawls_crawl_id(crawler_auth_headers, default_org_id): + # Start crawl. 
+ crawl_data = { + "runNow": True, + "name": "All Crawls Test Crawl", + "description": "Lorem ipsum", + "config": { + "seeds": [{"url": "https://webrecorder.net/"}], + }, + } + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=crawler_auth_headers, + json=crawl_data, + ) + data = r.json() + + global _all_crawls_config_id + _all_crawls_config_id = data["id"] + + crawl_id = data["run_now_job"] + # Wait for it to complete and then return crawl ID + while True: + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json", + headers=crawler_auth_headers, + ) + data = r.json() + if data["state"] == "complete": + break + time.sleep(5) + + # Add description to crawl + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}", + headers=crawler_auth_headers, + json={"description": "Lorem ipsum"}, + ) + assert r.status_code == 200 + return crawl_id + + +@pytest.fixture(scope="session") +def all_crawls_config_id(all_crawls_crawl_id): + return _all_crawls_config_id + + @pytest.fixture(scope="session") def uploads_collection_id(crawler_auth_headers, default_org_id): r = requests.post( diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index e4fb89f1..2c168023 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -191,10 +191,11 @@ def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id): # Submit patch request to update tags and description UPDATED_TAGS = ["wr-test-1-updated", "wr-test-2-updated"] UPDATED_DESC = "Lorem ipsum test note." + UPDATED_NAME = "Updated crawl name" r = requests.patch( f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}", headers=admin_auth_headers, - json={"tags": UPDATED_TAGS, "description": UPDATED_DESC}, + json={"tags": UPDATED_TAGS, "description": UPDATED_DESC, "name": UPDATED_NAME}, ) assert r.status_code == 200 data = r.json() @@ -209,6 +210,7 @@ def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id): data = r.json() assert sorted(data["tags"]) == sorted(UPDATED_TAGS) assert data["description"] == UPDATED_DESC + assert data["name"] == UPDATED_NAME # Verify deleting works as well r = requests.patch( diff --git a/backend/test/test_stop_cancel_crawl.py b/backend/test/test_stop_cancel_crawl.py index 225840fd..bbc114be 100644 --- a/backend/test/test_stop_cancel_crawl.py +++ b/backend/test/test_stop_cancel_crawl.py @@ -48,7 +48,14 @@ def test_cancel_crawl(default_org_id, crawler_auth_headers): data = get_crawl(default_org_id, crawler_auth_headers, crawl_id) - while data["state"] in ("running", "waiting_capacity"): + while data["state"] in ( + "starting", + "running", + "waiting_capacity", + "generate-wacz", + "uploading-wacz", + "pending-wait", + ): time.sleep(5) data = get_crawl(default_org_id, crawler_auth_headers, crawl_id) @@ -88,7 +95,14 @@ def test_start_crawl_and_stop_immediately( ) assert r.json()["lastCrawlStopping"] == True - while data["state"] in ("starting", "running", "waiting_capacity"): + while data["state"] in ( + "starting", + "running", + "waiting_capacity", + "generate-wacz", + "uploading-wacz", + "pending-wait", + ): time.sleep(5) data = get_crawl(default_org_id, crawler_auth_headers, crawl_id) @@ -149,7 +163,12 @@ def test_stop_crawl_partial( ) assert r.json()["lastCrawlStopping"] == True - while data["state"] == "running": + while data["state"] in ( + "running", + "generate-wacz", + "uploading-wacz", + "pending-wait", + ): time.sleep(5) data = get_crawl(default_org_id, 
crawler_auth_headers, crawl_id) diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index 2a46206a..db13cf9d 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -9,6 +9,8 @@ upload_id = None upload_id_2 = None upload_dl_path = None +_coll_id = None + curr_dir = os.path.dirname(os.path.realpath(__file__)) @@ -371,6 +373,275 @@ def test_list_all_crawls(admin_auth_headers, default_org_id): assert item["state"] +def test_get_all_crawls_by_name(admin_auth_headers, default_org_id): + """Test filtering /all-crawls by name""" + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?name=test2.wacz", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 1 + items = data["items"] + assert items[0]["id"] == upload_id_2 + assert items[0]["name"] == "test2.wacz" + + crawl_name = "Crawler User Test Crawl" + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?name={crawl_name}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 3 + for item in data["items"]: + assert item["name"] == crawl_name + + +def test_get_all_crawls_by_first_seed( + admin_auth_headers, default_org_id, crawler_crawl_id +): + """Test filtering /all-crawls by first seed""" + first_seed = "https://webrecorder.net/" + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?firstSeed={first_seed}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 3 + for item in data["items"]: + assert item["firstSeed"] == first_seed + + +def test_get_all_crawls_by_type(admin_auth_headers, default_org_id, admin_crawl_id): + """Test filtering /all-crawls by crawl type""" + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=crawl", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 3 + for item in data["items"]: + assert item["type"] == "crawl" + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=upload", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 3 + for item in data["items"]: + assert item["type"] == "upload" + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=invalid", + headers=admin_auth_headers, + ) + assert r.status_code == 400 + assert r.json()["detail"] == "invalid_crawl_type" + + +def test_get_all_crawls_by_user(admin_auth_headers, default_org_id, crawler_userid): + """Test filtering /all-crawls by userid""" + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?userid={crawler_userid}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 4 + for item in data["items"]: + assert item["userid"] == crawler_userid + + +def test_get_all_crawls_by_cid( + admin_auth_headers, default_org_id, all_crawls_config_id +): + """Test filtering /all-crawls by cid""" + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?cid={all_crawls_config_id}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 1 + assert data["items"][0]["cid"] == all_crawls_config_id + + +def test_get_all_crawls_by_state(admin_auth_headers, default_org_id, admin_crawl_id): + """Test filtering /all-crawls by cid""" + r = requests.get( + 
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?state=complete,partial_complete", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 5 + items = data["items"] + for item in items: + assert item["state"] in ("complete", "partial_complete") + + +def test_get_all_crawls_by_collection_id( + admin_auth_headers, default_org_id, admin_config_id, all_crawls_crawl_id +): + """Test filtering /all-crawls by collection id""" + # Create collection and add upload to it + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/collections", + headers=admin_auth_headers, + json={ + "crawlIds": [all_crawls_crawl_id], + "name": "all-crawls collection", + }, + ) + assert r.status_code == 200 + global _coll_id + _coll_id = r.json()["id"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?collectionId={_coll_id}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + assert r.json()["total"] == 1 + assert r.json()["items"][0]["id"] == all_crawls_crawl_id + + +def test_sort_all_crawls(admin_auth_headers, default_org_id, admin_crawl_id): + # Sort by started, descending (default) + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started", + headers=admin_auth_headers, + ) + data = r.json() + assert data["total"] == 7 + items = data["items"] + assert len(items) == 7 + + last_created = None + for crawl in items: + if last_created: + assert crawl["started"] <= last_created + last_created = crawl["started"] + + # Sort by started, ascending + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started&sortDirection=1", + headers=admin_auth_headers, + ) + data = r.json() + items = data["items"] + + last_created = None + for crawl in items: + if last_created: + assert crawl["started"] >= last_created + last_created = crawl["started"] + + # Sort by finished + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=finished", + headers=admin_auth_headers, + ) + data = r.json() + items = data["items"] + + last_finished = None + for crawl in items: + if not crawl["finished"]: + continue + if last_finished: + assert crawl["finished"] <= last_finished + last_finished = crawl["finished"] + + # Sort by finished, ascending + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=finished&sortDirection=1", + headers=admin_auth_headers, + ) + data = r.json() + items = data["items"] + + last_finished = None + for crawl in items: + if not crawl["finished"]: + continue + if last_finished: + assert crawl["finished"] >= last_finished + last_finished = crawl["finished"] + + # Sort by fileSize + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=fileSize", + headers=admin_auth_headers, + ) + data = r.json() + items = data["items"] + + last_size = None + for crawl in items: + if last_size: + assert crawl["fileSize"] <= last_size + last_size = crawl["fileSize"] + + # Sort by fileSize, ascending + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=fileSize&sortDirection=1", + headers=admin_auth_headers, + ) + data = r.json() + items = data["items"] + + last_size = None + for crawl in items: + if last_size: + assert crawl["fileSize"] >= last_size + last_size = crawl["fileSize"] + + # Invalid sort value + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=invalid", + headers=admin_auth_headers, + ) + assert r.status_code == 400 + assert r.json()["detail"] == "invalid_sort_by" + + # Invalid 
sort_direction value + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started&sortDirection=0", + headers=admin_auth_headers, + ) + assert r.status_code == 400 + assert r.json()["detail"] == "invalid_sort_direction" + + +def test_all_crawls_search_values(admin_auth_headers, default_org_id): + """Test that all-crawls search values return expected results""" + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/search-values", + headers=admin_auth_headers, + ) + data = r.json() + + assert len(data["names"]) == 5 + expected_names = [ + "Crawler User Test Crawl", + "My Upload Updated", + "test2.wacz", + "All Crawls Test Crawl", + ] + for expected_name in expected_names: + assert expected_name in data["names"] + + assert sorted(data["descriptions"]) == ["Lorem ipsum"] + assert sorted(data["firstSeeds"]) == ["https://webrecorder.net/"] + assert len(data["crawlIds"]) == 7 + + def test_get_upload_from_all_crawls(admin_auth_headers, default_org_id): """Test that /all-crawls lists crawls and uploads before deleting uploads""" r = requests.get( diff --git a/backend/test_nightly/test_concurrent_crawl_limit.py b/backend/test_nightly/test_concurrent_crawl_limit.py index 3f0e680c..7f6ee3d2 100644 --- a/backend/test_nightly/test_concurrent_crawl_limit.py +++ b/backend/test_nightly/test_concurrent_crawl_limit.py @@ -31,8 +31,11 @@ def test_run_two_only_one_concurrent(org_with_quotas, admin_auth_headers): ): time.sleep(2) - assert ( - get_crawl_status(org_with_quotas, crawl_id_a, admin_auth_headers) == "running" + assert get_crawl_status(org_with_quotas, crawl_id_a, admin_auth_headers) in ( + "running", + "generate-wacz", + "uploading-wacz", + "pending-wait", ) while ( @@ -68,6 +71,10 @@ def test_cancel_and_run_other(org_with_quotas, admin_auth_headers): assert get_crawl_status(org_with_quotas, crawl_id_b, admin_auth_headers) in ( "starting", "running", + "waiting_capacity", + "generate-wacz", + "uploading-wacz", + "pending-wait", ) # cancel second crawl as well
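
Usage sketch: the tests above exercise the updated /all-crawls list filters and the new search-values endpoint directly; as a quick client-side reference, a minimal sketch might look like the following. API_PREFIX, ORG_ID, and AUTH_HEADERS are placeholders for a running deployment, not values from this patch; requests URL-encodes the query values, and the endpoint decodes free-text values (name, firstSeed, description) before filtering, as added here.

# Sketch: filtering/sorting archived items via /all-crawls and fetching
# the new search values. Placeholder deployment values, not from this patch.
import requests

API_PREFIX = "https://app.example.com/api"  # placeholder base URL
ORG_ID = "00000000-0000-0000-0000-000000000000"  # placeholder org id
AUTH_HEADERS = {"Authorization": "Bearer <token>"}  # placeholder auth

# requests URL-encodes these query values; the endpoint decodes
# name/firstSeed/description before matching.
params = {
    "firstSeed": "https://webrecorder.net/",
    "crawlType": "crawl",                  # "crawl" or "upload"
    "state": "complete,partial_complete",  # comma-separated states
    "sortBy": "fileSize",                  # started, finished, or fileSize
    "sortDirection": -1,
}
r = requests.get(
    f"{API_PREFIX}/orgs/{ORG_ID}/all-crawls",
    headers=AUTH_HEADERS,
    params=params,
)
items = r.json()["items"]

# Distinct names, descriptions, first seeds, and crawl ids for building
# search controls across all archived item types.
r = requests.get(
    f"{API_PREFIX}/orgs/{ORG_ID}/all-crawls/search-values",
    headers=AUTH_HEADERS,
)
search_values = r.json()  # keys: names, descriptions, firstSeeds, crawlIds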
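
Usage sketch: crawl names and descriptions are now editable through the existing crawl PATCH endpoint, as exercised in test_update_crawl above. A sketch of that call, again with placeholder deployment values (API_PREFIX, ORG_ID, CRAWL_ID, AUTH_HEADERS are not from this patch):

# Sketch: updating a crawl's name, description, and tags via the backend API.
import requests

API_PREFIX = "https://app.example.com/api"  # placeholder base URL
ORG_ID = "00000000-0000-0000-0000-000000000000"  # placeholder org id
CRAWL_ID = "example-crawl-id"  # placeholder crawl id
AUTH_HEADERS = {"Authorization": "Bearer <token>"}  # placeholder auth

r = requests.patch(
    f"{API_PREFIX}/orgs/{ORG_ID}/crawls/{CRAWL_ID}",
    headers=AUTH_HEADERS,
    json={
        "name": "Updated crawl name",
        "description": "Lorem ipsum test note.",
        "tags": ["wr-test-1-updated"],
    },
)
assert r.status_code == 200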