Backend: standardize search values, filters, and sorting for archived items (#1039)
- The all-crawls list endpoint filters now conform to "Standardize list controls for archived items" (#1025) and URL-decode values before passing them in (see the usage sketch after this list).
- The uploads list endpoint now includes all all-crawls filters relevant to uploads.
- An all-crawls/search-values endpoint is added to support searching across all archived item types.
- Crawl configuration names are now copied to the crawl when the crawl is created, and crawl names and descriptions are now editable via the backend API (note: frontend changes will also be needed to make them editable via the UI).
- A migration is added to copy existing config names for active configs into their associated crawls; the migration has been tested in a local deployment.
- The new statuses generate-wacz, uploading-wacz, and pending-wait are added to tests where relevant to ensure that they pass.
- Test coverage is added for all new all-crawls endpoints, filters, and sort values.
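For reviewers, a minimal sketch of how a client might exercise the new filters, the search-values endpoint, and the now-editable crawl name/description. `API_PREFIX`, `org_id`, `crawl_id`, and `auth_headers` are placeholders assumed for illustration, not part of this change:

    import urllib.parse
    import requests

    # Placeholders (assumptions): point these at a running deployment.
    API_PREFIX = "https://btrix.example.org/api"
    org_id = "<org-uuid>"
    crawl_id = "<crawl-id>"
    auth_headers = {"Authorization": "Bearer <token>"}

    # Filter values may be URL-encoded by the client; the backend decodes them.
    first_seed = urllib.parse.quote("https://webrecorder.net/", safe="")
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/all-crawls"
        f"?firstSeed={first_seed}&crawlType=crawl&sortBy=fileSize&sortDirection=-1",
        headers=auth_headers,
    )
    print(r.json()["total"])

    # New endpoint: unique names, descriptions, first seeds, and crawl IDs
    # to back searching across all archived item types.
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/all-crawls/search-values",
        headers=auth_headers,
    )
    print(r.json()["names"])

    # Crawl name and description are now editable via the backend API.
    r = requests.patch(
        f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}",
        headers=auth_headers,
        json={"name": "Updated crawl name", "description": "Lorem ipsum"},
    )
    assert r.status_code == 200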
parent 9236a07800 · commit 7ff57ce6b5
@@ -5,6 +5,7 @@ import uuid
import os
from datetime import timedelta
from typing import Optional, List, Union
import urllib.parse

from pydantic import UUID4
from fastapi import HTTPException, Depends
@@ -196,16 +197,11 @@ class BaseCrawlOps:
        config = await self.crawl_configs.get_crawl_config(
            crawl.cid, org, active_only=False
        )

        if config:
            if not crawl.name:
                crawl.name = config.name

            if config.config.seeds:
                if add_first_seed:
                    first_seed = config.config.seeds[0]
                    crawl.firstSeed = first_seed.url
                crawl.seedCount = len(config.config.seeds)
        if config and config.config.seeds:
            if add_first_seed:
                first_seed = config.config.seeds[0]
                crawl.firstSeed = first_seed.url
            crawl.seedCount = len(config.config.seeds)

        if hasattr(crawl, "profileid") and crawl.profileid:
            crawl.profileName = await self.crawl_configs.profiles.get_profile_name(
@@ -327,7 +323,7 @@ class BaseCrawlOps:
            {"$pull": {"collections": collection_id}},
        )

    # pylint: disable=too-many-branches
    # pylint: disable=too-many-branches, invalid-name
    async def list_all_base_crawls(
        self,
        org: Optional[Organization] = None,
@@ -336,12 +332,14 @@ class BaseCrawlOps:
        description: str = None,
        collection_id: str = None,
        states: Optional[List[str]] = None,
        first_seed: Optional[str] = None,
        type_: Optional[str] = None,
        cid: Optional[UUID4] = None,
        cls_type: Union[CrawlOut, CrawlOutWithResources] = CrawlOut,
        page_size: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        sort_by: str = None,
        sort_direction: int = -1,
        type_=None,
    ):
        """List crawls of all types from the db"""
        # Zero-index page for query
@@ -367,7 +365,15 @@ class BaseCrawlOps:
            # validated_states = [value for value in state if value in ALL_CRAWL_STATES]
            query["state"] = {"$in": states}

        aggregate = [{"$match": query}, {"$unset": "errors"}]
        if cid:
            query["cid"] = cid

        aggregate = [
            {"$match": query},
            {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
            {"$set": {"firstSeed": "$firstSeedObject.url"}},
            {"$unset": ["firstSeedObject", "errors"]},
        ]

        if not resources:
            aggregate.extend([{"$unset": ["files"]}])
@@ -375,6 +381,9 @@ class BaseCrawlOps:
        if name:
            aggregate.extend([{"$match": {"name": name}}])

        if first_seed:
            aggregate.extend([{"$match": {"firstSeed": first_seed}}])

        if description:
            aggregate.extend([{"$match": {"description": description}}])

@@ -382,7 +391,7 @@ class BaseCrawlOps:
            aggregate.extend([{"$match": {"collections": {"$in": [collection_id]}}}])

        if sort_by:
            if sort_by not in ("started", "finished"):
            if sort_by not in ("started", "finished", "fileSize"):
                raise HTTPException(status_code=400, detail="invalid_sort_by")
            if sort_direction not in (1, -1):
                raise HTTPException(status_code=400, detail="invalid_sort_direction")
@@ -447,13 +456,40 @@ class BaseCrawlOps:

        return {"deleted": True}

    async def get_all_crawl_search_values(self, org: Organization):
        """List unique names, first seeds, and descriptions from all captures in org"""
        names = await self.crawls.distinct("name", {"oid": org.id})
        descriptions = await self.crawls.distinct("description", {"oid": org.id})
        crawl_ids = await self.crawls.distinct("_id", {"oid": org.id})
        cids = await self.crawls.distinct("cid", {"oid": org.id})

        # Remove empty strings
        names = [name for name in names if name]
        descriptions = [description for description in descriptions if description]

        # Get first seeds
        first_seeds = set()
        for cid in cids:
            if not cid:
                continue
            config = await self.crawl_configs.get_crawl_config(cid, org)
            first_seed = config.config.seeds[0]
            first_seeds.add(first_seed.url)

        return {
            "names": names,
            "descriptions": descriptions,
            "firstSeeds": list(first_seeds),
            "crawlIds": list(crawl_ids),
        }


# ============================================================================
def init_base_crawls_api(
    app, mdb, users, crawl_manager, crawl_config_ops, orgs, user_dep
):
    """base crawls api"""
    # pylint: disable=invalid-name, duplicate-code, too-many-arguments
    # pylint: disable=invalid-name, duplicate-code, too-many-arguments, too-many-locals

    ops = BaseCrawlOps(mdb, users, crawl_config_ops, crawl_manager)
@@ -472,12 +508,28 @@ def init_base_crawls_api(
        userid: Optional[UUID4] = None,
        name: Optional[str] = None,
        state: Optional[str] = None,
        firstSeed: Optional[str] = None,
        description: Optional[str] = None,
        collectionId: Optional[UUID4] = None,
        crawlType: Optional[str] = None,
        cid: Optional[UUID4] = None,
        sortBy: Optional[str] = "finished",
        sortDirection: Optional[int] = -1,
    ):
        states = state.split(",") if state else None

        if firstSeed:
            firstSeed = urllib.parse.unquote(firstSeed)

        if name:
            name = urllib.parse.unquote(name)

        if description:
            description = urllib.parse.unquote(description)

        if crawlType and crawlType not in ("crawl", "upload"):
            raise HTTPException(status_code=400, detail="invalid_crawl_type")

        crawls, total = await ops.list_all_base_crawls(
            org,
            userid=userid,
@@ -485,6 +537,9 @@ def init_base_crawls_api(
            description=description,
            collection_id=collectionId,
            states=states,
            first_seed=firstSeed,
            type_=crawlType,
            cid=cid,
            page_size=pageSize,
            page=page,
            sort_by=sortBy,
@@ -492,6 +547,12 @@ def init_base_crawls_api(
        )
        return paginated_format(crawls, total, page, pageSize)

    @app.get("/orgs/{oid}/all-crawls/search-values", tags=["all-crawls"])
    async def get_all_crawls_search_values(
        org: Organization = Depends(org_viewer_dep),
    ):
        return await ops.get_all_crawl_search_values(org)

    @app.get(
        "/orgs/{oid}/all-crawls/{crawl_id}",
        tags=["all-crawls"],
@@ -68,11 +68,15 @@ class CrawlOps(BaseCrawlOps):
        await self.crawls.create_index(
            [("type", pymongo.HASHED), ("state", pymongo.DESCENDING)]
        )
        await self.crawls.create_index(
            [("type", pymongo.HASHED), ("fileSize", pymongo.DESCENDING)]
        )

        await self.crawls.create_index([("finished", pymongo.DESCENDING)])
        await self.crawls.create_index([("oid", pymongo.HASHED)])
        await self.crawls.create_index([("cid", pymongo.HASHED)])
        await self.crawls.create_index([("state", pymongo.HASHED)])
        await self.crawls.create_index([("fileSize", pymongo.DESCENDING)])

    async def list_crawls(
        self,
@@ -127,15 +131,6 @@ class CrawlOps(BaseCrawlOps):
            {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
            {"$set": {"firstSeed": "$firstSeedObject.url"}},
            {"$unset": ["firstSeedObject", "errors"]},
            {
                "$lookup": {
                    "from": "crawl_configs",
                    "localField": "cid",
                    "foreignField": "_id",
                    "as": "crawlConfig",
                },
            },
            {"$set": {"name": {"$arrayElemAt": ["$crawlConfig.name", 0]}}},
        ]

        if not resources:
@@ -154,7 +149,12 @@ class CrawlOps(BaseCrawlOps):
            aggregate.extend([{"$match": {"collections": {"$in": [collection_id]}}}])

        if sort_by:
            if sort_by not in ("started", "finished", "fileSize", "firstSeed"):
            if sort_by not in (
                "started",
                "finished",
                "fileSize",
                "firstSeed",
            ):
                raise HTTPException(status_code=400, detail="invalid_sort_by")
            if sort_direction not in (1, -1):
                raise HTTPException(status_code=400, detail="invalid_sort_direction")
@@ -545,6 +545,7 @@ async def add_new_crawl(
        manual=manual,
        started=started,
        tags=crawlconfig.tags,
        name=crawlconfig.name,
    )

    try:
@@ -15,7 +15,7 @@ from pymongo.errors import InvalidName
from .migrations import BaseMigration


CURR_DB_VERSION = "0012"
CURR_DB_VERSION = "0013"


# ============================================================================
backend/btrixcloud/migrations/migration_0013_crawl_name.py (new file, 42 lines)
@@ -0,0 +1,42 @@
"""
Migration 0013 - Copy config name to crawls
"""
from btrixcloud.migrations import BaseMigration


MIGRATION_VERSION = "0013"


class Migration(BaseMigration):
    """Migration class."""

    def __init__(self, mdb, migration_version=MIGRATION_VERSION):
        super().__init__(mdb, migration_version)

    async def migrate_up(self):
        """Perform migration up.

        Copy crawl config names to associated crawls.
        """
        # pylint: disable=duplicate-code
        crawls = self.mdb["crawls"]
        crawl_configs = self.mdb["crawl_configs"]

        configs = [res async for res in crawl_configs.find({"inactive": {"$ne": True}})]
        if not configs:
            return

        for config in configs:
            config_id = config["_id"]
            try:
                if not config.get("name"):
                    continue
                await crawls.update_many(
                    {"cid": config_id}, {"$set": {"name": config.get("name")}}
                )
            # pylint: disable=broad-exception-caught
            except Exception as err:
                print(
                    f"Unable to set name for crawls from with config {config_id}: {err}",
                    flush=True,
                )
@@ -296,6 +296,8 @@ class BaseCrawl(BaseMongoModel):
    started: datetime
    finished: Optional[datetime]

    name: Optional[str]

    state: str

    stats: Optional[Dict[str, int]]
@@ -368,7 +370,9 @@ class CrawlOutWithResources(CrawlOut):
class UpdateCrawl(BaseModel):
    """Update crawl"""

    tags: Optional[List[str]] = []
    name: Optional[str]
    description: Optional[str]
    tags: Optional[List[str]]
    description: Optional[str]
@@ -433,7 +437,6 @@ class UploadedCrawl(BaseCrawl):

    type: str = Field("upload", const=True)

    name: str
    tags: Optional[List[str]] = []
@@ -441,8 +444,6 @@
class UpdateUpload(UpdateCrawl):
    """Update modal that also includes name"""

    name: Optional[str]


# ============================================================================
@@ -299,6 +299,7 @@ def init_uploads_api(app, mdb, users, crawl_manager, crawl_configs, orgs, user_d
        org: Organization = Depends(org_viewer_dep),
        pageSize: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        state: Optional[str] = None,
        userid: Optional[UUID4] = None,
        name: Optional[str] = None,
        description: Optional[str] = None,
@@ -306,9 +307,18 @@ def init_uploads_api(app, mdb, users, crawl_manager, crawl_configs, orgs, user_d
        sortBy: Optional[str] = "finished",
        sortDirection: Optional[int] = -1,
    ):
        states = state.split(",") if state else None

        if name:
            name = unquote(name)

        if description:
            description = unquote(description)

        uploads, total = await ops.list_all_base_crawls(
            org,
            userid=userid,
            states=states,
            name=name,
            description=description,
            page_size=pageSize,
@@ -18,6 +18,7 @@ CRAWLER_PW = "crawlerPASSWORD!"
_admin_config_id = None
_crawler_config_id = None
_auto_add_config_id = None
_all_crawls_config_id = None

NON_DEFAULT_ORG_NAME = "Non-default org"
@@ -118,6 +119,12 @@ def admin_config_id(admin_crawl_id):
    return _admin_config_id


@pytest.fixture(scope="session")
def admin_userid(admin_auth_headers):
    r = requests.get(f"{API_PREFIX}/users/me", headers=admin_auth_headers)
    return r.json()["id"]


@pytest.fixture(scope="session")
def viewer_auth_headers(admin_auth_headers, default_org_id):
    requests.post(
@@ -331,6 +338,54 @@ def auto_add_config_id(auto_add_crawl_id):
    return _auto_add_config_id


@pytest.fixture(scope="session")
def all_crawls_crawl_id(crawler_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "All Crawls Test Crawl",
        "description": "Lorem ipsum",
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    global _all_crawls_config_id
    _all_crawls_config_id = data["id"]

    crawl_id = data["run_now_job"]
    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=crawler_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            break
        time.sleep(5)

    # Add description to crawl
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}",
        headers=crawler_auth_headers,
        json={"description": "Lorem ipsum"},
    )
    assert r.status_code == 200
    return crawl_id


@pytest.fixture(scope="session")
def all_crawls_config_id(all_crawls_crawl_id):
    return _all_crawls_config_id


@pytest.fixture(scope="session")
def uploads_collection_id(crawler_auth_headers, default_org_id):
    r = requests.post(
@@ -191,10 +191,11 @@ def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id):
    # Submit patch request to update tags and description
    UPDATED_TAGS = ["wr-test-1-updated", "wr-test-2-updated"]
    UPDATED_DESC = "Lorem ipsum test note."
    UPDATED_NAME = "Updated crawl name"
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
        json={"tags": UPDATED_TAGS, "description": UPDATED_DESC},
        json={"tags": UPDATED_TAGS, "description": UPDATED_DESC, "name": UPDATED_NAME},
    )
    assert r.status_code == 200
    data = r.json()
@@ -209,6 +210,7 @@ def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id):
    data = r.json()
    assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
    assert data["description"] == UPDATED_DESC
    assert data["name"] == UPDATED_NAME

    # Verify deleting works as well
    r = requests.patch(
@@ -48,7 +48,14 @@ def test_cancel_crawl(default_org_id, crawler_auth_headers):

    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

    while data["state"] in ("running", "waiting_capacity"):
    while data["state"] in (
        "starting",
        "running",
        "waiting_capacity",
        "generate-wacz",
        "uploading-wacz",
        "pending-wait",
    ):
        time.sleep(5)
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
@@ -88,7 +95,14 @@ def test_start_crawl_and_stop_immediately(
    )
    assert r.json()["lastCrawlStopping"] == True

    while data["state"] in ("starting", "running", "waiting_capacity"):
    while data["state"] in (
        "starting",
        "running",
        "waiting_capacity",
        "generate-wacz",
        "uploading-wacz",
        "pending-wait",
    ):
        time.sleep(5)
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
@@ -149,7 +163,12 @@ def test_stop_crawl_partial(
    )
    assert r.json()["lastCrawlStopping"] == True

    while data["state"] == "running":
    while data["state"] in (
        "running",
        "generate-wacz",
        "uploading-wacz",
        "pending-wait",
    ):
        time.sleep(5)
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
@@ -9,6 +9,8 @@ upload_id = None
upload_id_2 = None
upload_dl_path = None

_coll_id = None


curr_dir = os.path.dirname(os.path.realpath(__file__))
@@ -371,6 +373,275 @@ def test_list_all_crawls(admin_auth_headers, default_org_id):
        assert item["state"]


def test_get_all_crawls_by_name(admin_auth_headers, default_org_id):
    """Test filtering /all-crawls by name"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?name=test2.wacz",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 1
    items = data["items"]
    assert items[0]["id"] == upload_id_2
    assert items[0]["name"] == "test2.wacz"

    crawl_name = "Crawler User Test Crawl"
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?name={crawl_name}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    for item in data["items"]:
        assert item["name"] == crawl_name


def test_get_all_crawls_by_first_seed(
    admin_auth_headers, default_org_id, crawler_crawl_id
):
    """Test filtering /all-crawls by first seed"""
    first_seed = "https://webrecorder.net/"
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?firstSeed={first_seed}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    for item in data["items"]:
        assert item["firstSeed"] == first_seed


def test_get_all_crawls_by_type(admin_auth_headers, default_org_id, admin_crawl_id):
    """Test filtering /all-crawls by crawl type"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=crawl",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    for item in data["items"]:
        assert item["type"] == "crawl"

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=upload",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    for item in data["items"]:
        assert item["type"] == "upload"

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=invalid",
        headers=admin_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_crawl_type"


def test_get_all_crawls_by_user(admin_auth_headers, default_org_id, crawler_userid):
    """Test filtering /all-crawls by userid"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?userid={crawler_userid}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 4
    for item in data["items"]:
        assert item["userid"] == crawler_userid


def test_get_all_crawls_by_cid(
    admin_auth_headers, default_org_id, all_crawls_config_id
):
    """Test filtering /all-crawls by cid"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?cid={all_crawls_config_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 1
    assert data["items"][0]["cid"] == all_crawls_config_id


def test_get_all_crawls_by_state(admin_auth_headers, default_org_id, admin_crawl_id):
    """Test filtering /all-crawls by cid"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?state=complete,partial_complete",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 5
    items = data["items"]
    for item in items:
        assert item["state"] in ("complete", "partial_complete")


def test_get_all_crawls_by_collection_id(
    admin_auth_headers, default_org_id, admin_config_id, all_crawls_crawl_id
):
    """Test filtering /all-crawls by collection id"""
    # Create collection and add upload to it
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=admin_auth_headers,
        json={
            "crawlIds": [all_crawls_crawl_id],
            "name": "all-crawls collection",
        },
    )
    assert r.status_code == 200
    global _coll_id
    _coll_id = r.json()["id"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?collectionId={_coll_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["total"] == 1
    assert r.json()["items"][0]["id"] == all_crawls_crawl_id


def test_sort_all_crawls(admin_auth_headers, default_org_id, admin_crawl_id):
    # Sort by started, descending (default)
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started",
        headers=admin_auth_headers,
    )
    data = r.json()
    assert data["total"] == 7
    items = data["items"]
    assert len(items) == 7

    last_created = None
    for crawl in items:
        if last_created:
            assert crawl["started"] <= last_created
        last_created = crawl["started"]

    # Sort by started, ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started&sortDirection=1",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_created = None
    for crawl in items:
        if last_created:
            assert crawl["started"] >= last_created
        last_created = crawl["started"]

    # Sort by finished
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=finished",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_finished = None
    for crawl in items:
        if not crawl["finished"]:
            continue
        if last_finished:
            assert crawl["finished"] <= last_finished
        last_finished = crawl["finished"]

    # Sort by finished, ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=finished&sortDirection=1",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_finished = None
    for crawl in items:
        if not crawl["finished"]:
            continue
        if last_finished:
            assert crawl["finished"] >= last_finished
        last_finished = crawl["finished"]

    # Sort by fileSize
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=fileSize",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_size = None
    for crawl in items:
        if last_size:
            assert crawl["fileSize"] <= last_size
        last_size = crawl["fileSize"]

    # Sort by fileSize, ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=fileSize&sortDirection=1",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_size = None
    for crawl in items:
        if last_size:
            assert crawl["fileSize"] >= last_size
        last_size = crawl["fileSize"]

    # Invalid sort value
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=invalid",
        headers=admin_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_sort_by"

    # Invalid sort_direction value
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started&sortDirection=0",
        headers=admin_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_sort_direction"


def test_all_crawls_search_values(admin_auth_headers, default_org_id):
    """Test that all-crawls search values return expected results"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/search-values",
        headers=admin_auth_headers,
    )
    data = r.json()

    assert len(data["names"]) == 5
    expected_names = [
        "Crawler User Test Crawl",
        "My Upload Updated",
        "test2.wacz",
        "All Crawls Test Crawl",
    ]
    for expected_name in expected_names:
        assert expected_name in data["names"]

    assert sorted(data["descriptions"]) == ["Lorem ipsum"]
    assert sorted(data["firstSeeds"]) == ["https://webrecorder.net/"]
    assert len(data["crawlIds"]) == 7


def test_get_upload_from_all_crawls(admin_auth_headers, default_org_id):
    """Test that /all-crawls lists crawls and uploads before deleting uploads"""
    r = requests.get(
@@ -31,8 +31,11 @@ def test_run_two_only_one_concurrent(org_with_quotas, admin_auth_headers):
    ):
        time.sleep(2)

    assert (
        get_crawl_status(org_with_quotas, crawl_id_a, admin_auth_headers) == "running"
    assert get_crawl_status(org_with_quotas, crawl_id_a, admin_auth_headers) in (
        "running",
        "generate-wacz",
        "uploading-wacz",
        "pending-wait",
    )

    while (
@@ -68,6 +71,10 @@ def test_cancel_and_run_other(org_with_quotas, admin_auth_headers):
    assert get_crawl_status(org_with_quotas, crawl_id_b, admin_auth_headers) in (
        "starting",
        "running",
        "waiting_capacity",
        "generate-wacz",
        "uploading-wacz",
        "pending-wait",
    )

    # cancel second crawl as well