Fixes #2673

Changes in this PR:

- Adds a new `file_uploads.py` module and a corresponding `/files` API prefix with endpoints for uploading, retrieving, and deleting seed files (this can be extended to other file types moving forward); see the usage sketch below.
- Seed files are supported via `CrawlConfig.config.seedFileId` on the POST and PATCH endpoints. This `seedFileId` is replaced by a presigned URL when the config is passed to the crawler by the operator.
- Seed files are read when first uploaded to calculate `firstSeed` and `seedCount`, which are stored in the database and copied into the workflow and crawl documents when those are created.
- Logic is added to store `firstSeed` and `seedCount` for other workflows as well, and a migration backfills the data, to maintain consistency and to fix some pymongo aggregations that previously assumed every workflow has at least one `Seed` object in `CrawlConfig.seeds`.
- Seed file and thumbnail storage stats are added to org stats.
- Seed file and thumbnail uploads first check that the org's storage quota has not been exceeded and return a 400 if it has.
- A cron background job (run weekly at midnight each Sunday by default, but configurable) looks for seed files at least x minutes old (1440 minutes, i.e. 1 day, by default, but configurable) that are not in use in any workflow and deletes them. The backend pods ensure this k8s batch job exists on startup and create it if it does not already exist. A database entry for each run of the job is created in the operator on job completion so that it appears in the `/jobs` API endpoints, but retrying this type of regularly scheduled background job is not supported, as we don't want to accidentally create multiple competing scheduled jobs.
- Adds a `min_seed_file_crawler_image` value to the Helm chart that, if set, is checked before creating a crawl from a workflow. If a workflow cannot be run, the detail of the exception is returned in `CrawlConfigAddedResponse.errorDetail` so that the frontend can display the reason.
- Adds a `SeedFile` model based on `UserFile` (formerly `ImageFile`) and ensures all APIs returning uploaded files return an absolute presigned URL (with either the external origin or the internal service origin).

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
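As a rough illustration of the flow described above, here is a minimal sketch in Python. Only the `/files` prefix, `CrawlConfig.config.seedFileId`, and `CrawlConfigAddedResponse.errorDetail` are taken from the change list; the exact upload route, query parameter, and response fields are assumptions made for illustration only.

```python
import requests

API_PREFIX = "https://btrix.example.com/api"  # assumption: deployment-specific origin
ORG_ID = "<org-id>"
HEADERS = {"Authorization": "Bearer <access-token>"}

# 1. Upload a seed file (one URL per line) under the new /files prefix.
#    The route and query parameter below are hypothetical.
with open("seeds.txt", "rb") as fh:
    r = requests.put(
        f"{API_PREFIX}/orgs/{ORG_ID}/files/seedFile?filename=seeds.txt",
        headers=HEADERS,
        data=fh,
    )
r.raise_for_status()
seed_file_id = r.json()["id"]  # assumption: upload response includes the file id

# 2. Reference the uploaded file from a workflow via config.seedFileId instead of
#    an inline seed list; firstSeed and seedCount are then derived server-side.
r = requests.post(
    f"{API_PREFIX}/orgs/{ORG_ID}/crawlconfigs/",
    headers=HEADERS,
    json={
        "name": "Seed file workflow",
        "config": {"seedFileId": seed_file_id, "scopeType": "page"},
    },
)
r.raise_for_status()
data = r.json()

# If min_seed_file_crawler_image is set and the configured crawler image is too old,
# the reason is surfaced in errorDetail instead of a crawl being started.
print(data.get("id"), data.get("errorDetail"))
```

If no workflow ends up referencing the uploaded file, the scheduled cleanup job described above deletes it once it is older than the configured minimum age.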
import requests
import os
import time
from tempfile import TemporaryFile
from urllib.parse import urljoin
from zipfile import ZipFile, ZIP_STORED

import pytest

from .conftest import API_PREFIX
from .utils import read_in_chunks


curr_dir = os.path.dirname(os.path.realpath(__file__))

MAX_ATTEMPTS = 24


@pytest.fixture(scope="module")
|
|
def upload_id(admin_auth_headers, default_org_id, uploads_collection_id):
|
|
with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
|
|
r = requests.put(
|
|
f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test.wacz&name=My%20Upload&description=Testing%0AData&collections={uploads_collection_id}&tags=one%2Ctwo",
|
|
headers=admin_auth_headers,
|
|
data=read_in_chunks(fh),
|
|
)
|
|
|
|
assert r.status_code == 200
|
|
assert r.json()["added"]
|
|
|
|
upload_id = r.json()["id"]
|
|
assert upload_id
|
|
return upload_id
|
|
|
|
|
|
@pytest.fixture(scope="module")
|
|
def upload_id_2(admin_auth_headers, default_org_id, uploads_collection_id):
|
|
with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
|
|
data = fh.read()
|
|
|
|
files = [
|
|
("uploads", ("test.wacz", data, "application/octet-stream")),
|
|
("uploads", ("test-2.wacz", data, "application/octet-stream")),
|
|
("uploads", ("test.wacz", data, "application/octet-stream")),
|
|
]
|
|
|
|
r = requests.put(
|
|
f"{API_PREFIX}/orgs/{default_org_id}/uploads/formdata?name=test2.wacz&collections={uploads_collection_id}&tags=three%2Cfour",
|
|
headers=admin_auth_headers,
|
|
files=files,
|
|
)
|
|
|
|
assert r.status_code == 200
|
|
data = r.json()
|
|
assert data["added"]
|
|
assert data["storageQuotaReached"] is False
|
|
|
|
upload_id_2 = r.json()["id"]
|
|
assert upload_id_2
|
|
return upload_id_2
|
|
|
|
|
|
@pytest.fixture(scope="module")
|
|
def replaced_upload_id(
|
|
admin_auth_headers, default_org_id, uploads_collection_id, upload_id
|
|
):
|
|
# Replace upload_id with a non-existent upload
|
|
actual_id = do_upload_replace(
|
|
admin_auth_headers, default_org_id, upload_id, uploads_collection_id
|
|
)
|
|
|
|
assert actual_id
|
|
assert actual_id != upload_id
|
|
return actual_id
|
|
|
|
|
|
def test_list_stream_upload(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads",
        headers=admin_auth_headers,
    )
    results = r.json()

    assert len(results["items"]) > 0

    found = None

    for res in results["items"]:
        if res["id"] == upload_id:
            found = res

    assert found
    assert found["name"] == "My Upload"
    assert found["description"] == "Testing\nData"
    assert found["collectionIds"] == [uploads_collection_id]
    assert sorted(found["tags"]) == ["one", "two"]
    assert "files" not in found
    assert "resources" not in found


def test_get_stream_upload(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    result = r.json()
    assert uploads_collection_id in result["collectionIds"]
    assert "files" not in result
    upload_dl_path = result["resources"][0]["path"]
    assert "test-" in result["resources"][0]["name"]
    assert result["resources"][0]["name"].endswith(".wacz")

    dl_path = urljoin(API_PREFIX, upload_dl_path)
    wacz_resp = requests.get(dl_path)
    actual = wacz_resp.content

    with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
        expected = fh.read()

    assert len(actual) == len(expected)
    assert actual == expected

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200


def test_list_uploads(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id_2
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads",
        headers=admin_auth_headers,
    )
    results = r.json()

    assert len(results["items"]) > 1

    found = None

    for res in results["items"]:
        if res["id"] == upload_id_2:
            found = res

    assert found
    assert found["name"] == "test2.wacz"
    assert found["collectionIds"] == [uploads_collection_id]
    assert sorted(found["tags"]) == ["four", "three"]

    assert "files" not in found
    assert "resources" not in found


def test_collection_uploads(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id, upload_id_2
):
    # Test uploads filtered by collection
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads?collectionId={uploads_collection_id}",
        headers=admin_auth_headers,
    )

    results = r.json()

    assert len(results["items"]) == 2
    assert results["items"][0]["id"] in (upload_id, upload_id_2)
    assert results["items"][1]["id"] in (upload_id, upload_id_2)

    # Test all crawls filtered by collection
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?collectionId={uploads_collection_id}",
        headers=admin_auth_headers,
    )

    results = r.json()

    assert len(results["items"]) == 2
    assert results["items"][0]["id"] in (upload_id, upload_id_2)
    assert results["items"][1]["id"] in (upload_id, upload_id_2)


def test_get_upload_replay_json(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data
    assert data["id"] == upload_id
    assert data["name"] == "My Upload"
    assert data["collectionIds"] == [uploads_collection_id]
    assert sorted(data["tags"]) == ["one", "two"]
    assert data["resources"]
    assert data["resources"][0]["path"]
    assert data["resources"][0]["size"]
    assert data["resources"][0]["hash"]
    assert data["errors"] == []
    assert "files" not in data
    assert data["version"] == 2


def test_get_upload_replay_json_admin(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/all/uploads/{upload_id}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data
    assert data["id"] == upload_id
    assert data["name"] == "My Upload"
    assert data["collectionIds"] == [uploads_collection_id]
    assert sorted(data["tags"]) == ["one", "two"]
    assert data["resources"]
    assert data["resources"][0]["path"]
    assert data["resources"][0]["size"]
    assert data["resources"][0]["hash"]
    assert data["errors"] == []
    assert "files" not in data
    assert data["version"] == 2


def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
    # Give time for pages to finish being uploaded
    time.sleep(10)

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/pages",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["total"] > 0

    pages = data["items"]
    for page in pages:
        assert page["id"]
        assert page["oid"]
        assert page["crawl_id"] == upload_id
        assert page["url"]
        assert page["ts"]
        assert page["filename"]
        assert page.get("title") or page.get("title") is None
        assert page["isSeed"]

    page_id = pages[0]["id"]
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/pages/{page_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    page = r.json()

    assert page["id"] == page_id
    assert page["oid"]
    assert page["crawl_id"]
    assert page["url"]
    assert page["ts"]
    assert page["filename"]
    assert page.get("title") or page.get("title") is None
    assert page["isSeed"]

    assert page["notes"] == []
    assert page.get("userid") is None
    assert page.get("modified") is None
    assert page.get("approved") is None

    # Check that pageCount and uniquePageCount stored on upload
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0


def test_uploads_collection_updated(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id
):
    # Verify that collection is updated when WACZ is added on upload
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{uploads_collection_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["crawlCount"] > 0
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["modified"] >= data["created"]


def test_replace_upload(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id
):
    actual_id = do_upload_replace(
        admin_auth_headers, default_org_id, upload_id, uploads_collection_id
    )

    assert upload_id == actual_id


def do_upload_replace(
    admin_auth_headers, default_org_id, upload_id, uploads_collection_id
):
    with open(os.path.join(curr_dir, "data", "example-2.wacz"), "rb") as fh:
        r = requests.put(
            f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test.wacz&name=My%20Upload%20Updated&replaceId={upload_id}&collections={uploads_collection_id}",
            headers=admin_auth_headers,
            data=read_in_chunks(fh),
        )

    assert r.status_code == 200
    assert r.json()["added"]
    actual_id = r.json()["id"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{actual_id}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    result = r.json()

    # only one file, previous file removed
    assert len(result["resources"]) == 1

    dl_path = urljoin(API_PREFIX, result["resources"][0]["path"])
    wacz_resp = requests.get(dl_path)
    actual = wacz_resp.content

    with open(os.path.join(curr_dir, "data", "example-2.wacz"), "rb") as fh:
        expected = fh.read()

    assert len(actual) == len(expected)
    assert actual == expected

    return actual_id


def test_update_upload_metadata(admin_auth_headers, default_org_id, upload_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["name"] == "My Upload Updated"
    assert not data["tags"]
    assert not data["description"]
    assert len(data["collectionIds"]) == 1

    # Make new collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=admin_auth_headers,
        json={"name": "Patch Update Test Collection"},
    )
    patch_coll_id = r.json()["id"]

    # Submit patch request to update name, tags, and description
    UPDATED_NAME = "New Upload Name"
    UPDATED_TAGS = ["wr-test-1-updated", "wr-test-2-updated"]
    UPDATED_DESC = "Lorem ipsum test note."
    UPDATED_COLLECTION_IDS = [patch_coll_id]
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}",
        headers=admin_auth_headers,
        json={
            "tags": UPDATED_TAGS,
            "description": UPDATED_DESC,
            "name": UPDATED_NAME,
            "collectionIds": UPDATED_COLLECTION_IDS,
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["updated"]

    # Verify update was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
    assert data["description"] == UPDATED_DESC
    assert data["name"] == UPDATED_NAME
    assert data["collectionIds"] == UPDATED_COLLECTION_IDS


def test_download_wacz_uploads(admin_auth_headers, default_org_id, upload_id):
    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/download",
            headers=admin_auth_headers,
            stream=True,
        ) as r:
            assert r.status_code == 200
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) == 2
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_delete_stream_upload(
    admin_auth_headers, crawler_auth_headers, default_org_id, upload_id
):
    # Verify non-admin user who didn't upload crawl can't delete it
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/delete",
        headers=crawler_auth_headers,
        json={"crawl_ids": [upload_id]},
    )
    assert r.status_code == 403
    assert r.json()["detail"] == "not_allowed"

    # Verify user who created upload can delete it
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": [upload_id]},
    )
    data = r.json()
    assert data["deleted"]
    assert data["storageQuotaReached"] is False


def test_ensure_deleted(admin_auth_headers, default_org_id, upload_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads",
        headers=admin_auth_headers,
    )
    results = r.json()

    for res in results["items"]:
        if res["id"] == upload_id:
            assert False


def test_verify_from_upload_resource_count(
    admin_auth_headers, default_org_id, upload_id_2
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id_2}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    result = r.json()

    assert "files" not in result
    assert len(result["resources"]) == 3

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id_2}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200


def test_list_all_crawls(
    admin_auth_headers, default_org_id, replaced_upload_id, upload_id_2
):
    """Test that /all-crawls lists crawls and uploads before deleting uploads"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    items = data["items"]

    assert len(items) == data["total"]

    crawls = [item for item in items if item["type"] == "crawl"]
    assert len(crawls) > 0

    uploads = [item for item in items if item["type"] == "upload"]
    assert len(uploads) > 0

    for item in items:
        assert item["type"] in ("crawl", "upload")

        if item["type"] == "crawl":
            assert item["firstSeed"]
            assert item["seedCount"]
            assert item.get("name") or item.get("name") == ""

        assert item["id"]
        assert item["userid"]
        assert item["oid"] == default_org_id
        assert item["started"]
        assert item["finished"]
        assert item["state"]
        assert item["version"] == 2

    # Test that all-crawls lastQAState and lastQAStarted sorts always put crawls before uploads
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=lastQAState",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    last_type = None
    for item in data["items"]:
        if last_type == "upload":
            assert item["type"] != "crawl"
        last_type = item["type"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=lastQAStarted",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    last_type = None
    for item in data["items"]:
        if last_type == "upload":
            assert item["type"] != "crawl"
        last_type = item["type"]


def test_get_all_crawls_by_name(
    admin_auth_headers, default_org_id, replaced_upload_id, upload_id_2
):
    """Test filtering /all-crawls by name"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?name=test2.wacz",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 1
    items = data["items"]
    assert items[0]["id"] == upload_id_2
    assert items[0]["name"] == "test2.wacz"

    crawl_name = "Crawler User Test Crawl"
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?name={crawl_name}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    for item in data["items"]:
        assert item["name"] == crawl_name


def test_get_all_crawls_by_first_seed(
    admin_auth_headers,
    default_org_id,
    crawler_crawl_id,
    replaced_upload_id,
    upload_id_2,
):
    """Test filtering /all-crawls by first seed"""
    first_seed = "https://webrecorder.net/"
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?firstSeed={first_seed}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    for item in data["items"]:
        assert item["firstSeed"] == first_seed


def test_get_all_crawls_by_type(
    admin_auth_headers, default_org_id, admin_crawl_id, replaced_upload_id, upload_id_2
):
    """Test filtering /all-crawls by crawl type"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=crawl",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 7
    for item in data["items"]:
        assert item["type"] == "crawl"

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=upload",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    for item in data["items"]:
        assert item["type"] == "upload"

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=invalid",
        headers=admin_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_crawl_type"


def test_get_all_crawls_by_user(
    admin_auth_headers, default_org_id, crawler_userid, replaced_upload_id, upload_id_2
):
    """Test filtering /all-crawls by userid"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?userid={crawler_userid}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 6
    for item in data["items"]:
        assert item["userid"] == crawler_userid


def test_get_all_crawls_by_cid(
    admin_auth_headers, default_org_id, all_crawls_config_id
):
    """Test filtering /all-crawls by cid"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?cid={all_crawls_config_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 1
    assert data["items"][0]["cid"] == all_crawls_config_id


def test_get_all_crawls_by_state(
    admin_auth_headers, default_org_id, admin_crawl_id, replaced_upload_id, upload_id_2
):
    """Test filtering /all-crawls by state"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?state=complete,stopped_by_user",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] >= 5
    items = data["items"]
    for item in items:
        assert item["state"] in (
            "complete",
            "stopped_by_user",
        )


def test_get_all_crawls_by_collection_id(
    admin_auth_headers, default_org_id, admin_config_id, all_crawls_crawl_id
):
    """Test filtering /all-crawls by collection id"""
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=admin_auth_headers,
        json={
            "crawlIds": [all_crawls_crawl_id],
            "name": "all-crawls collection",
        },
    )
    assert r.status_code == 200
    new_coll_id = r.json()["id"]
    assert new_coll_id

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?collectionId={new_coll_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["total"] == 1
    assert r.json()["items"][0]["id"] == all_crawls_crawl_id


def test_sort_all_crawls(
    admin_auth_headers, default_org_id, admin_crawl_id, replaced_upload_id, upload_id_2
):
    # Sort by started, descending (default)
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started",
        headers=admin_auth_headers,
    )
    data = r.json()
    assert data["total"] >= 9
    items = data["items"]
    assert len(items) >= 9

    last_created = None
    for crawl in items:
        if last_created:
            assert crawl["started"] <= last_created
        last_created = crawl["started"]

    # Sort by started, ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started&sortDirection=1",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_created = None
    for crawl in items:
        if last_created:
            assert crawl["started"] >= last_created
        last_created = crawl["started"]

    # Sort by finished
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=finished",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_finished = None
    for crawl in items:
        if not crawl["finished"]:
            continue
        if last_finished:
            assert crawl["finished"] <= last_finished
        last_finished = crawl["finished"]

    # Sort by finished, ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=finished&sortDirection=1",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_finished = None
    for crawl in items:
        if not crawl["finished"]:
            continue
        if last_finished:
            assert crawl["finished"] >= last_finished
        last_finished = crawl["finished"]

    # Sort by fileSize
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=fileSize",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_size = None
    for crawl in items:
        if last_size:
            assert crawl["fileSize"] <= last_size
        last_size = crawl["fileSize"]

    # Sort by fileSize, ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=fileSize&sortDirection=1",
        headers=admin_auth_headers,
    )
    data = r.json()
    items = data["items"]

    last_size = None
    for crawl in items:
        if last_size:
            assert crawl["fileSize"] >= last_size
        last_size = crawl["fileSize"]

    # Invalid sort value
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=invalid",
        headers=admin_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_sort_by"

    # Invalid sort_direction value
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started&sortDirection=0",
        headers=admin_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_sort_direction"


def test_all_crawls_search_values(
    admin_auth_headers, default_org_id, replaced_upload_id, upload_id_2
):
    """Test that all-crawls search values return expected results"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/search-values",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert len(data["names"]) == 9
    expected_names = [
        "Crawler User Test Crawl",
        "Custom Behavior Logs",
        "My Upload Updated",
        "test2.wacz",
        "All Crawls Test Crawl",
        "Crawler User Crawl for Testing QA",
        "Seed File Test Crawl",
    ]
    for expected_name in expected_names:
        assert expected_name in data["names"]

    assert sorted(data["descriptions"]) == ["Lorem ipsum"]
    assert sorted(data["firstSeeds"]) == [
        "https://old.webrecorder.net/",
        "https://specs.webrecorder.net/",
        "https://webrecorder.net/",
    ]

    # Test filtering by crawls
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/search-values?crawlType=crawl",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert len(data["names"]) == 6
    expected_names = [
        "Admin Test Crawl",
        "All Crawls Test Crawl",
        "Crawler User Crawl for Testing QA",
        "Crawler User Test Crawl",
        "Custom Behavior Logs",
        "Seed File Test Crawl",
    ]
    for expected_name in expected_names:
        assert expected_name in data["names"]

    assert sorted(data["descriptions"]) == ["Lorem ipsum"]
    assert sorted(data["firstSeeds"]) == [
        "https://old.webrecorder.net/",
        "https://specs.webrecorder.net/",
        "https://webrecorder.net/",
    ]

    # Test filtering by uploads
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/search-values?crawlType=upload",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert len(data["names"]) == 3
    expected_names = [
        "My Upload Updated",
        "test2.wacz",
    ]
    for expected_name in expected_names:
        assert expected_name in data["names"]

    assert sorted(data["descriptions"]) == []
    assert sorted(data["firstSeeds"]) == []

    # Test invalid filter
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/search-values?crawlType=invalid",
        headers=admin_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_crawl_type"


def test_get_upload_from_all_crawls(admin_auth_headers, default_org_id, upload_id_2):
    """Test getting a single upload from /all-crawls"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id_2}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["name"] == "test2.wacz"

    assert "files" not in data
    assert data["resources"]


def test_get_upload_replay_json_from_all_crawls(
    admin_auth_headers, default_org_id, upload_id_2
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id_2}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data
    assert data["id"] == upload_id_2
    assert data["name"] == "test2.wacz"
    assert data["resources"]
    assert data["resources"][0]["path"]
    assert data["resources"][0]["size"]
    assert data["resources"][0]["hash"]
    assert data["errors"] == []
    assert "files" not in data
    assert data["version"] == 2


def test_get_upload_replay_json_admin_from_all_crawls(
    admin_auth_headers, default_org_id, upload_id_2
):
    r = requests.get(
        f"{API_PREFIX}/orgs/all/all-crawls/{upload_id_2}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data
    assert data["id"] == upload_id_2
    assert data["name"] == "test2.wacz"
    assert data["resources"]
    assert data["resources"][0]["path"]
    assert data["resources"][0]["size"]
    assert data["resources"][0]["hash"]
    assert data["errors"] == []
    assert "files" not in data
    assert data["version"] == 2


def test_update_upload_metadata_all_crawls(
    admin_auth_headers, default_org_id, replaced_upload_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{replaced_upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["name"] == "My Upload Updated"
    assert not data["tags"]
    assert not data["description"]
    assert len(data["collectionIds"]) == 1

    # Make new collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=admin_auth_headers,
        json={"name": "Patch Update Test Collection 2"},
    )
    patch_coll_id_2 = r.json()["id"]

    # Submit patch request to update name, tags, and description
    UPDATED_NAME = "New Upload Name 2"
    UPDATED_TAGS = ["wr-test-1-updated-again", "wr-test-2-updated-again"]
    UPDATED_DESC = "Lorem ipsum test note 2."
    UPDATED_COLLECTION_IDS = [patch_coll_id_2]
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{replaced_upload_id}",
        headers=admin_auth_headers,
        json={
            "tags": UPDATED_TAGS,
            "description": UPDATED_DESC,
            "name": UPDATED_NAME,
            "collectionIds": UPDATED_COLLECTION_IDS,
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["updated"]

    # Verify update was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{replaced_upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
    assert data["description"] == UPDATED_DESC
    assert data["name"] == UPDATED_NAME
    assert data["collectionIds"] == UPDATED_COLLECTION_IDS

    # Submit patch request to set collections to empty list
    UPDATED_COLLECTION_IDS = []
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{replaced_upload_id}",
        headers=admin_auth_headers,
        json={
            "collectionIds": UPDATED_COLLECTION_IDS,
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["updated"]

    # Verify update was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{replaced_upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
    assert data["description"] == UPDATED_DESC
    assert data["name"] == UPDATED_NAME
    assert data["collectionIds"] == []


def test_clear_all_presigned_urls(
    admin_auth_headers, crawler_auth_headers, default_org_id
):
    # All orgs
    r = requests.post(
        f"{API_PREFIX}/orgs/clear-presigned-urls",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 403
    assert r.json()["detail"] == "Not Allowed"

    r = requests.post(
        f"{API_PREFIX}/orgs/clear-presigned-urls",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["success"]

    # Per-org
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/clear-presigned-urls",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 403

    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/clear-presigned-urls",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["success"]


def test_delete_form_upload_and_crawls_from_all_crawls(
    admin_auth_headers,
    crawler_auth_headers,
    default_org_id,
    all_crawls_delete_crawl_ids,
    all_crawls_delete_config_id,
    upload_id_2,
):
    crawls_to_delete = all_crawls_delete_crawl_ids
    crawls_to_delete.append(upload_id_2)

    # Get org metrics
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/metrics",
        headers=admin_auth_headers,
    )
    data = r.json()

    org_bytes = data["storageUsedBytes"]
    org_crawl_bytes = data["storageUsedCrawls"]
    org_upload_bytes = data["storageUsedUploads"]

    # Get workflow and crawl sizes
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{all_crawls_delete_config_id}",
        headers=admin_auth_headers,
    )
    workflow_size = r.json()["totalSize"]

    crawl_id_1 = all_crawls_delete_crawl_ids[0]
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_1}/replay.json",
        headers=admin_auth_headers,
    )
    crawl_1_size = r.json()["fileSize"]

    crawl_id_2 = all_crawls_delete_crawl_ids[1]
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_2}/replay.json",
        headers=admin_auth_headers,
    )
    crawl_2_size = r.json()["fileSize"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id_2}/replay.json",
        headers=admin_auth_headers,
    )
    upload_size = r.json()["fileSize"]

    combined_crawl_size = crawl_1_size + crawl_2_size
    total_size = combined_crawl_size + upload_size

    # Verify that non-admin user can't delete another's items
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/delete",
        headers=crawler_auth_headers,
        json={"crawl_ids": crawls_to_delete},
    )
    assert r.status_code == 403
    assert r.json()["detail"] == "not_allowed"

    # Delete mixed type archived items
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": crawls_to_delete},
    )
    data = r.json()
    assert data["deleted"]
    assert data["storageQuotaReached"] is False

    # Check that org and workflow size figures are as expected
    count = 0
    while count < MAX_ATTEMPTS:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/metrics",
            headers=admin_auth_headers,
        )
        data = r.json()

        all_good = True

        if data["storageUsedBytes"] != org_bytes - total_size:
            all_good = False

        if data["storageUsedCrawls"] != org_crawl_bytes - combined_crawl_size:
            all_good = False

        if data["storageUsedUploads"] != org_upload_bytes - upload_size:
            all_good = False

        if all_good:
            break

        if count + 1 == MAX_ATTEMPTS:
            assert data["storageUsedBytes"] == org_bytes - total_size
            assert data["storageUsedCrawls"] == org_crawl_bytes - combined_crawl_size
            assert data["storageUsedUploads"] == org_upload_bytes - upload_size

        time.sleep(5)
        count += 1

    count = 0
    while count < MAX_ATTEMPTS:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{all_crawls_delete_config_id}",
            headers=admin_auth_headers,
        )
        if r.json()["totalSize"] == workflow_size - combined_crawl_size:
            break

        if count + 1 == MAX_ATTEMPTS:
            assert False

        time.sleep(10)
        count += 1