browsertrix/backend/test/test_uploads.py
Tessa Walsh f7ba712646 Add seed file support to Browsertrix backend (#2710)
Fixes #2673 

Changes in this PR:

- Adds a new `file_uploads.py` module and corresponding `/files` API
prefix with methods/endpoints for uploading, GETing, and deleting seed
files (can be extended to other types of files moving forward)
- Seed files are supported via `CrawlConfig.config.seedFileId` on POST
and PATCH endpoints. This `seedFileId` is replaced by a presigned URL
when the config is passed to the crawler by the operator (see the sketch
after this list)
- Seed files are read when first uploaded to calculate `firstSeed` and
`seedCount`, which are stored in the database and copied into the
workflow and crawl documents when they are created
- Logic is added to store `firstSeed` and `seedCount` for other
workflows as well, and a migration backfills the data, to maintain
consistency and fix some pymongo aggregations that previously assumed
all workflows would have at least one `Seed` object in
`CrawlConfig.seeds`
- Seed file and thumbnail storage stats are added to org stats
- Seed file and thumbnail uploads first check that the org's storage
quota has not been exceeded and return a 400 if it has
- A cron background job (run weekly each Sunday at midnight by default,
but configurable) is added that looks for seed files at least x minutes
old (1440 minutes, or 1 day, by default, also configurable) that are not
in use in any workflow, and deletes them. The backend pods ensure this
k8s batch job exists on startup and create it if it does not already
exist. A database entry for each run of the job is created in the
operator on job completion so that it appears in the `/jobs` API
endpoints; retrying this type of regularly scheduled background job is
not supported, as we don't want to accidentally create multiple
competing scheduled jobs
- Adds a `min_seed_file_crawler_image` value to the Helm chart that, if
set, is checked before creating a crawl from a workflow. If a workflow
cannot be run, the detail of the exception is returned in
`CrawlConfigAddedResponse.errorDetail` so that the reason can be
displayed in the frontend
- Adds a `SeedFile` model derived from a base `UserFile` (formerly
`ImageFile`) and ensures all APIs returning uploaded files return an
absolute presigned URL (with either the external origin or the internal
service origin)
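
Below is a minimal usage sketch of the flow described above, written in the same style as the tests in this file. The upload endpoint path and response fields used here (`/files/seedFile`, the returned `id`) are illustrative assumptions based on this description, not a verified API reference; only `CrawlConfig.config.seedFileId` and `errorDetail` are named above.

```python
# Hypothetical sketch only: the /files upload path and response fields are
# assumptions based on the PR description, not the authoritative API.
import requests

API_PREFIX = "https://browsertrix.example.com/api"  # assumed deployment URL
org_id = "my-org-id"                                # an existing org id
headers = {"Authorization": "Bearer <token>"}       # org member auth headers

# 1. Upload a seed list; a 400 is returned if the org storage quota is exceeded.
with open("seeds.txt", "rb") as fh:
    r = requests.put(
        f"{API_PREFIX}/orgs/{org_id}/files/seedFile?filename=seeds.txt",
        headers=headers,
        data=fh,
    )
seed_file_id = r.json()["id"]  # firstSeed/seedCount are computed at upload time

# 2. Reference the uploaded file from a workflow via config.seedFileId.
r = requests.post(
    f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/",
    headers=headers,
    json={"name": "Seed File Workflow", "config": {"seedFileId": seed_file_id}},
)
# If the crawl can't be started (e.g. the crawler image is older than
# min_seed_file_crawler_image), errorDetail carries the reason.
print(r.json().get("errorDetail"))
```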

---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
2025-07-22 19:11:02 -07:00


import requests
import os
import time

from tempfile import TemporaryFile
from urllib.parse import urljoin
from zipfile import ZipFile, ZIP_STORED

import pytest

from .conftest import API_PREFIX
from .utils import read_in_chunks

curr_dir = os.path.dirname(os.path.realpath(__file__))

MAX_ATTEMPTS = 24


@pytest.fixture(scope="module")
def upload_id(admin_auth_headers, default_org_id, uploads_collection_id):
    with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
        r = requests.put(
            f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test.wacz&name=My%20Upload&description=Testing%0AData&collections={uploads_collection_id}&tags=one%2Ctwo",
            headers=admin_auth_headers,
            data=read_in_chunks(fh),
        )
        assert r.status_code == 200
        assert r.json()["added"]

        upload_id = r.json()["id"]
        assert upload_id
        return upload_id


@pytest.fixture(scope="module")
def upload_id_2(admin_auth_headers, default_org_id, uploads_collection_id):
    with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
        data = fh.read()

        files = [
            ("uploads", ("test.wacz", data, "application/octet-stream")),
            ("uploads", ("test-2.wacz", data, "application/octet-stream")),
            ("uploads", ("test.wacz", data, "application/octet-stream")),
        ]

        r = requests.put(
            f"{API_PREFIX}/orgs/{default_org_id}/uploads/formdata?name=test2.wacz&collections={uploads_collection_id}&tags=three%2Cfour",
            headers=admin_auth_headers,
            files=files,
        )
        assert r.status_code == 200
        data = r.json()
        assert data["added"]
        assert data["storageQuotaReached"] is False

        upload_id_2 = r.json()["id"]
        assert upload_id_2
        return upload_id_2


@pytest.fixture(scope="module")
def replaced_upload_id(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id
):
    # Replace upload_id with a non-existent upload
    actual_id = do_upload_replace(
        admin_auth_headers, default_org_id, upload_id, uploads_collection_id
    )
    assert actual_id
    assert actual_id != upload_id
    return actual_id


def test_list_stream_upload(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads",
        headers=admin_auth_headers,
    )
    results = r.json()

    assert len(results["items"]) > 0

    found = None
    for res in results["items"]:
        if res["id"] == upload_id:
            found = res

    assert found
    assert found["name"] == "My Upload"
    assert found["description"] == "Testing\nData"
    assert found["collectionIds"] == [uploads_collection_id]
    assert sorted(found["tags"]) == ["one", "two"]
    assert "files" not in found
    assert "resources" not in found


def test_get_stream_upload(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    result = r.json()
    assert uploads_collection_id in result["collectionIds"]
    assert "files" not in result

    upload_dl_path = result["resources"][0]["path"]
    assert "test-" in result["resources"][0]["name"]
    assert result["resources"][0]["name"].endswith(".wacz")

    dl_path = urljoin(API_PREFIX, upload_dl_path)
    wacz_resp = requests.get(dl_path)
    actual = wacz_resp.content

    with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
        expected = fh.read()

    assert len(actual) == len(expected)
    assert actual == expected

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200


def test_list_uploads(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id_2
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads",
        headers=admin_auth_headers,
    )
    results = r.json()

    assert len(results["items"]) > 1

    found = None
    for res in results["items"]:
        if res["id"] == upload_id_2:
            found = res

    assert found
    assert found["name"] == "test2.wacz"
    assert found["collectionIds"] == [uploads_collection_id]
    assert sorted(found["tags"]) == ["four", "three"]
    assert "files" not in found
    assert "resources" not in found


def test_collection_uploads(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id, upload_id_2
):
    # Test uploads filtered by collection
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads?collectionId={uploads_collection_id}",
        headers=admin_auth_headers,
    )
    results = r.json()
    assert len(results["items"]) == 2
    assert results["items"][0]["id"] in (upload_id, upload_id_2)
    assert results["items"][1]["id"] in (upload_id, upload_id_2)

    # Test all crawls filtered by collection
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?collectionId={uploads_collection_id}",
        headers=admin_auth_headers,
    )
    results = r.json()
    assert len(results["items"]) == 2
    assert results["items"][0]["id"] in (upload_id, upload_id_2)
    assert results["items"][1]["id"] in (upload_id, upload_id_2)


def test_get_upload_replay_json(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data
    assert data["id"] == upload_id
    assert data["name"] == "My Upload"
    assert data["collectionIds"] == [uploads_collection_id]
    assert sorted(data["tags"]) == ["one", "two"]
    assert data["resources"]
    assert data["resources"][0]["path"]
    assert data["resources"][0]["size"]
    assert data["resources"][0]["hash"]
    assert data["errors"] == []
    assert "files" not in data
    assert data["version"] == 2


def test_get_upload_replay_json_admin(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/all/uploads/{upload_id}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data
    assert data["id"] == upload_id
    assert data["name"] == "My Upload"
    assert data["collectionIds"] == [uploads_collection_id]
    assert sorted(data["tags"]) == ["one", "two"]
    assert data["resources"]
    assert data["resources"][0]["path"]
    assert data["resources"][0]["size"]
    assert data["resources"][0]["hash"]
    assert data["errors"] == []
    assert "files" not in data
    assert data["version"] == 2


def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
    # Give time for pages to finish being uploaded
    time.sleep(10)

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/pages",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["total"] > 0

    pages = data["items"]
    for page in pages:
        assert page["id"]
        assert page["oid"]
        assert page["crawl_id"] == upload_id
        assert page["url"]
        assert page["ts"]
        assert page["filename"]
        assert page.get("title") or page.get("title") is None
        assert page["isSeed"]

    page_id = pages[0]["id"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/pages/{page_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    page = r.json()

    assert page["id"] == page_id
    assert page["oid"]
    assert page["crawl_id"]
    assert page["url"]
    assert page["ts"]
    assert page["filename"]
    assert page.get("title") or page.get("title") is None
    assert page["isSeed"]

    assert page["notes"] == []
    assert page.get("userid") is None
    assert page.get("modified") is None
    assert page.get("approved") is None

    # Check that pageCount and uniquePageCount stored on upload
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0


def test_uploads_collection_updated(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id
):
    # Verify that collection is updated when WACZ is added on upload
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{uploads_collection_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["crawlCount"] > 0
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["modified"] >= data["created"]


def test_replace_upload(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id
):
    actual_id = do_upload_replace(
        admin_auth_headers, default_org_id, upload_id, uploads_collection_id
    )
    assert upload_id == actual_id


def do_upload_replace(
    admin_auth_headers, default_org_id, upload_id, uploads_collection_id
):
    with open(os.path.join(curr_dir, "data", "example-2.wacz"), "rb") as fh:
        r = requests.put(
            f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test.wacz&name=My%20Upload%20Updated&replaceId={upload_id}&collections={uploads_collection_id}",
            headers=admin_auth_headers,
            data=read_in_chunks(fh),
        )
        assert r.status_code == 200
        assert r.json()["added"]
        actual_id = r.json()["id"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{actual_id}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    result = r.json()

    # only one file, previous file removed
    assert len(result["resources"]) == 1

    dl_path = urljoin(API_PREFIX, result["resources"][0]["path"])
    wacz_resp = requests.get(dl_path)
    actual = wacz_resp.content

    with open(os.path.join(curr_dir, "data", "example-2.wacz"), "rb") as fh:
        expected = fh.read()

    assert len(actual) == len(expected)
    assert actual == expected

    return actual_id


def test_update_upload_metadata(admin_auth_headers, default_org_id, upload_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["name"] == "My Upload Updated"
    assert not data["tags"]
    assert not data["description"]
    assert len(data["collectionIds"]) == 1

    # Make new collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=admin_auth_headers,
        json={"name": "Patch Update Test Collection"},
    )
    patch_coll_id = r.json()["id"]

    # Submit patch request to update name, tags, and description
    UPDATED_NAME = "New Upload Name"
    UPDATED_TAGS = ["wr-test-1-updated", "wr-test-2-updated"]
    UPDATED_DESC = "Lorem ipsum test note."
    UPDATED_COLLECTION_IDS = [patch_coll_id]

    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}",
        headers=admin_auth_headers,
        json={
            "tags": UPDATED_TAGS,
            "description": UPDATED_DESC,
            "name": UPDATED_NAME,
            "collectionIds": UPDATED_COLLECTION_IDS,
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["updated"]

    # Verify update was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
    assert data["description"] == UPDATED_DESC
    assert data["name"] == UPDATED_NAME
    assert data["collectionIds"] == UPDATED_COLLECTION_IDS


def test_download_wacz_uploads(admin_auth_headers, default_org_id, upload_id):
    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/download",
            headers=admin_auth_headers,
            stream=True,
        ) as r:
            assert r.status_code == 200
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) == 2
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_delete_stream_upload(
    admin_auth_headers, crawler_auth_headers, default_org_id, upload_id
):
    # Verify non-admin user who didn't upload crawl can't delete it
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/delete",
        headers=crawler_auth_headers,
        json={"crawl_ids": [upload_id]},
    )
    assert r.status_code == 403
    assert r.json()["detail"] == "not_allowed"

    # Verify user who created upload can delete it
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": [upload_id]},
    )
    data = r.json()
    assert data["deleted"]
    assert data["storageQuotaReached"] is False


def test_ensure_deleted(admin_auth_headers, default_org_id, upload_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads",
        headers=admin_auth_headers,
    )
    results = r.json()
    for res in results["items"]:
        if res["id"] == upload_id:
            assert False


def test_verify_from_upload_resource_count(
    admin_auth_headers, default_org_id, upload_id_2
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id_2}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    result = r.json()

    assert "files" not in result
    assert len(result["resources"]) == 3

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id_2}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200


def test_list_all_crawls(
    admin_auth_headers, default_org_id, replaced_upload_id, upload_id_2
):
    """Test that /all-crawls lists crawls and uploads before deleting uploads"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    items = data["items"]
    assert len(items) == data["total"]

    crawls = [item for item in items if item["type"] == "crawl"]
    assert len(crawls) > 0

    uploads = [item for item in items if item["type"] == "upload"]
    assert len(uploads) > 0

    for item in items:
        assert item["type"] in ("crawl", "upload")
        if item["type"] == "crawl":
            assert item["firstSeed"]
            assert item["seedCount"]

        assert item.get("name") or item.get("name") == ""
        assert item["id"]
        assert item["userid"]
        assert item["oid"] == default_org_id
        assert item["started"]
        assert item["finished"]
        assert item["state"]
        assert item["version"] == 2

    # Test that all-crawls lastQAState and lastQAStarted sorts always puts crawls before uploads
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=lastQAState",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    last_type = None
    for item in data["items"]:
        if last_type == "upload":
            assert item["type"] != "crawl"
        last_type = item["type"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=lastQAStarted",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    last_type = None
    for item in data["items"]:
        if last_type == "upload":
            assert item["type"] != "crawl"
        last_type = item["type"]


def test_get_all_crawls_by_name(
    admin_auth_headers, default_org_id, replaced_upload_id, upload_id_2
):
    """Test filtering /all-crawls by name"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?name=test2.wacz",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 1
    items = data["items"]
    assert items[0]["id"] == upload_id_2
    assert items[0]["name"] == "test2.wacz"

    crawl_name = "Crawler User Test Crawl"
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?name={crawl_name}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    for item in data["items"]:
        assert item["name"] == crawl_name


def test_get_all_crawls_by_first_seed(
    admin_auth_headers,
    default_org_id,
    crawler_crawl_id,
    replaced_upload_id,
    upload_id_2,
):
    """Test filtering /all-crawls by first seed"""
    first_seed = "https://webrecorder.net/"
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?firstSeed={first_seed}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    for item in data["items"]:
        assert item["firstSeed"] == first_seed


def test_get_all_crawls_by_type(
    admin_auth_headers, default_org_id, admin_crawl_id, replaced_upload_id, upload_id_2
):
    """Test filtering /all-crawls by crawl type"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=crawl",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 7
    for item in data["items"]:
        assert item["type"] == "crawl"

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=upload",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    for item in data["items"]:
        assert item["type"] == "upload"

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=invalid",
        headers=admin_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_crawl_type"


def test_get_all_crawls_by_user(
    admin_auth_headers, default_org_id, crawler_userid, replaced_upload_id, upload_id_2
):
    """Test filtering /all-crawls by userid"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?userid={crawler_userid}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 6
    for item in data["items"]:
        assert item["userid"] == crawler_userid


def test_get_all_crawls_by_cid(
    admin_auth_headers, default_org_id, all_crawls_config_id
):
    """Test filtering /all-crawls by cid"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?cid={all_crawls_config_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 1
    assert data["items"][0]["cid"] == all_crawls_config_id


def test_get_all_crawls_by_state(
    admin_auth_headers, default_org_id, admin_crawl_id, replaced_upload_id, upload_id_2
):
    """Test filtering /all-crawls by state"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?state=complete,stopped_by_user",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] >= 5
    items = data["items"]
    for item in items:
        assert item["state"] in (
            "complete",
            "stopped_by_user",
        )


def test_get_all_crawls_by_collection_id(
    admin_auth_headers, default_org_id, admin_config_id, all_crawls_crawl_id
):
    """Test filtering /all-crawls by collection id"""
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=admin_auth_headers,
        json={
            "crawlIds": [all_crawls_crawl_id],
            "name": "all-crawls collection",
        },
    )
    assert r.status_code == 200
    new_coll_id = r.json()["id"]
    assert new_coll_id

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?collectionId={new_coll_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["total"] == 1
    assert r.json()["items"][0]["id"] == all_crawls_crawl_id


def test_sort_all_crawls(
    admin_auth_headers, default_org_id, admin_crawl_id, replaced_upload_id, upload_id_2
):
    # Sort by started, descending (default)
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started",
        headers=admin_auth_headers,
    )
    data = r.json()
    assert data["total"] >= 9

    items = data["items"]
    assert len(items) >= 9

    last_created = None
    for crawl in items:
        if last_created:
            assert crawl["started"] <= last_created
        last_created = crawl["started"]

    # Sort by started, ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started&sortDirection=1",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_created = None
    for crawl in items:
        if last_created:
            assert crawl["started"] >= last_created
        last_created = crawl["started"]

    # Sort by finished
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=finished",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_finished = None
    for crawl in items:
        if not crawl["finished"]:
            continue
        if last_finished:
            assert crawl["finished"] <= last_finished
        last_finished = crawl["finished"]

    # Sort by finished, ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=finished&sortDirection=1",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_finished = None
    for crawl in items:
        if not crawl["finished"]:
            continue
        if last_finished:
            assert crawl["finished"] >= last_finished
        last_finished = crawl["finished"]

    # Sort by fileSize
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=fileSize",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_size = None
    for crawl in items:
        if last_size:
            assert crawl["fileSize"] <= last_size
        last_size = crawl["fileSize"]

    # Sort by fileSize, ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=fileSize&sortDirection=1",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_size = None
    for crawl in items:
        if last_size:
            assert crawl["fileSize"] >= last_size
        last_size = crawl["fileSize"]

    # Invalid sort value
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=invalid",
        headers=admin_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_sort_by"

    # Invalid sort_direction value
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started&sortDirection=0",
        headers=admin_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_sort_direction"


def test_all_crawls_search_values(
    admin_auth_headers, default_org_id, replaced_upload_id, upload_id_2
):
    """Test that all-crawls search values return expected results"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/search-values",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert len(data["names"]) == 9
    expected_names = [
        "Crawler User Test Crawl",
        "Custom Behavior Logs",
        "My Upload Updated",
        "test2.wacz",
        "All Crawls Test Crawl",
        "Crawler User Crawl for Testing QA",
        "Seed File Test Crawl",
    ]
    for expected_name in expected_names:
        assert expected_name in data["names"]

    assert sorted(data["descriptions"]) == ["Lorem ipsum"]
    assert sorted(data["firstSeeds"]) == [
        "https://old.webrecorder.net/",
        "https://specs.webrecorder.net/",
        "https://webrecorder.net/",
    ]

    # Test filtering by crawls
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/search-values?crawlType=crawl",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert len(data["names"]) == 6
    expected_names = [
        "Admin Test Crawl",
        "All Crawls Test Crawl",
        "Crawler User Crawl for Testing QA",
        "Crawler User Test Crawl",
        "Custom Behavior Logs",
        "Seed File Test Crawl",
    ]
    for expected_name in expected_names:
        assert expected_name in data["names"]

    assert sorted(data["descriptions"]) == ["Lorem ipsum"]
    assert sorted(data["firstSeeds"]) == [
        "https://old.webrecorder.net/",
        "https://specs.webrecorder.net/",
        "https://webrecorder.net/",
    ]

    # Test filtering by uploads
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/search-values?crawlType=upload",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert len(data["names"]) == 3
    expected_names = [
        "My Upload Updated",
        "test2.wacz",
    ]
    for expected_name in expected_names:
        assert expected_name in data["names"]

    assert sorted(data["descriptions"]) == []
    assert sorted(data["firstSeeds"]) == []

    # Test invalid filter
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/search-values?crawlType=invalid",
        headers=admin_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_crawl_type"


def test_get_upload_from_all_crawls(admin_auth_headers, default_org_id, upload_id_2):
    """Test getting an upload via /all-crawls"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id_2}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["name"] == "test2.wacz"
    assert "files" not in data
    assert data["resources"]


def test_get_upload_replay_json_from_all_crawls(
    admin_auth_headers, default_org_id, upload_id_2
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id_2}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data
    assert data["id"] == upload_id_2
    assert data["name"] == "test2.wacz"
    assert data["resources"]
    assert data["resources"][0]["path"]
    assert data["resources"][0]["size"]
    assert data["resources"][0]["hash"]
    assert data["errors"] == []
    assert "files" not in data
    assert data["version"] == 2


def test_get_upload_replay_json_admin_from_all_crawls(
    admin_auth_headers, default_org_id, upload_id_2
):
    r = requests.get(
        f"{API_PREFIX}/orgs/all/all-crawls/{upload_id_2}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data
    assert data["id"] == upload_id_2
    assert data["name"] == "test2.wacz"
    assert data["resources"]
    assert data["resources"][0]["path"]
    assert data["resources"][0]["size"]
    assert data["resources"][0]["hash"]
    assert data["errors"] == []
    assert "files" not in data
    assert data["version"] == 2


def test_update_upload_metadata_all_crawls(
    admin_auth_headers, default_org_id, replaced_upload_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{replaced_upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["name"] == "My Upload Updated"
    assert not data["tags"]
    assert not data["description"]
    assert len(data["collectionIds"]) == 1

    # Make new collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=admin_auth_headers,
        json={"name": "Patch Update Test Collection 2"},
    )
    patch_coll_id_2 = r.json()["id"]

    # Submit patch request to update name, tags, and description
    UPDATED_NAME = "New Upload Name 2"
    UPDATED_TAGS = ["wr-test-1-updated-again", "wr-test-2-updated-again"]
    UPDATED_DESC = "Lorem ipsum test note 2."
    UPDATED_COLLECTION_IDS = [patch_coll_id_2]

    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{replaced_upload_id}",
        headers=admin_auth_headers,
        json={
            "tags": UPDATED_TAGS,
            "description": UPDATED_DESC,
            "name": UPDATED_NAME,
            "collectionIds": UPDATED_COLLECTION_IDS,
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["updated"]

    # Verify update was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{replaced_upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
    assert data["description"] == UPDATED_DESC
    assert data["name"] == UPDATED_NAME
    assert data["collectionIds"] == UPDATED_COLLECTION_IDS

    # Submit patch request to set collections to empty list
    UPDATED_COLLECTION_IDS = []
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{replaced_upload_id}",
        headers=admin_auth_headers,
        json={
            "collectionIds": UPDATED_COLLECTION_IDS,
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["updated"]

    # Verify update was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{replaced_upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
    assert data["description"] == UPDATED_DESC
    assert data["name"] == UPDATED_NAME
    assert data["collectionIds"] == []


def test_clear_all_presigned_urls(
    admin_auth_headers, crawler_auth_headers, default_org_id
):
    # All orgs
    r = requests.post(
        f"{API_PREFIX}/orgs/clear-presigned-urls",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 403
    assert r.json()["detail"] == "Not Allowed"

    r = requests.post(
        f"{API_PREFIX}/orgs/clear-presigned-urls",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["success"]

    # Per-org
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/clear-presigned-urls",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 403

    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/clear-presigned-urls",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["success"]


def test_delete_form_upload_and_crawls_from_all_crawls(
    admin_auth_headers,
    crawler_auth_headers,
    default_org_id,
    all_crawls_delete_crawl_ids,
    all_crawls_delete_config_id,
    upload_id_2,
):
    crawls_to_delete = all_crawls_delete_crawl_ids
    crawls_to_delete.append(upload_id_2)

    # Get org metrics
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/metrics",
        headers=admin_auth_headers,
    )
    data = r.json()

    org_bytes = data["storageUsedBytes"]
    org_crawl_bytes = data["storageUsedCrawls"]
    org_upload_bytes = data["storageUsedUploads"]

    # Get workflow and crawl sizes
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{all_crawls_delete_config_id}",
        headers=admin_auth_headers,
    )
    workflow_size = r.json()["totalSize"]

    crawl_id_1 = all_crawls_delete_crawl_ids[0]
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_1}/replay.json",
        headers=admin_auth_headers,
    )
    crawl_1_size = r.json()["fileSize"]

    crawl_id_2 = all_crawls_delete_crawl_ids[1]
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_2}/replay.json",
        headers=admin_auth_headers,
    )
    crawl_2_size = r.json()["fileSize"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id_2}/replay.json",
        headers=admin_auth_headers,
    )
    upload_size = r.json()["fileSize"]

    combined_crawl_size = crawl_1_size + crawl_2_size
    total_size = combined_crawl_size + upload_size

    # Verify that non-admin user can't delete another's items
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/delete",
        headers=crawler_auth_headers,
        json={"crawl_ids": crawls_to_delete},
    )
    assert r.status_code == 403
    assert r.json()["detail"] == "not_allowed"

    # Delete mixed type archived items
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": crawls_to_delete},
    )
    data = r.json()
    assert data["deleted"]
    assert data["storageQuotaReached"] is False

    # Check that org and workflow size figures are as expected
    count = 0
    while count < MAX_ATTEMPTS:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/metrics",
            headers=admin_auth_headers,
        )
        data = r.json()

        all_good = True
        if data["storageUsedBytes"] != org_bytes - total_size:
            all_good = False
        if data["storageUsedCrawls"] != org_crawl_bytes - combined_crawl_size:
            all_good = False
        if data["storageUsedUploads"] != org_upload_bytes - upload_size:
            all_good = False

        if all_good:
            break

        if count + 1 == MAX_ATTEMPTS:
            assert data["storageUsedBytes"] == org_bytes - total_size
            assert data["storageUsedCrawls"] == org_crawl_bytes - combined_crawl_size
            assert data["storageUsedUploads"] == org_upload_bytes - upload_size

        time.sleep(5)
        count += 1

    count = 0
    while count < MAX_ATTEMPTS:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{all_crawls_delete_config_id}",
            headers=admin_auth_headers,
        )
        if r.json()["totalSize"] == workflow_size - combined_crawl_size:
            break

        if count + 1 == MAX_ATTEMPTS:
            assert False

        time.sleep(10)
        count += 1