browsertrix/backend/test/test_uploads.py
Ilya Kreymer 00eb62214d
Uploads API: BaseCrawl refactor + Initial support for /uploads endpoint (#937)
* basecrawl refactor: make crawls db more generic, supporting different types of 'base crawls': crawls, uploads, manual archives
- move shared functionality to basecrawl.py
- create a base BaseCrawl object, which contains start / finish time, metadata and files array
- create BaseCrawlOps, base class for CrawlOps, which supports base crawl deletion, querying, and collection add/remove (a model sketch follows this list)
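
A minimal sketch of the resulting model split (field names beyond start/finish time, metadata, and files are assumptions, not the actual schema):

    from datetime import datetime
    from typing import List, Optional
    from pydantic import BaseModel

    class CrawlFile(BaseModel):
        # per-file info, surfaced as 'resources' in replay.json
        filename: str
        size: int
        hash: str

    class BaseCrawl(BaseModel):
        # shared by all base crawl types: crawls, uploads, manual archives
        id: str
        started: Optional[datetime] = None
        finished: Optional[datetime] = None
        tags: List[str] = []
        files: List[CrawlFile] = []

    class UploadedCrawl(BaseCrawl):
        # uploads extend the base with a name and description
        type: str = "upload"
        name: str
        description: Optional[str] = None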

* uploads api: (part of #929)
- new UploadCrawl object which extends BaseCrawl, has name and description
- support multipart form data upload to /uploads/formdata
- support streaming upload of a single file via /uploads/stream, using botocore multipart upload to upload to the s3-endpoint in parts (sketched after this list)
- require 'filename' param to set upload filename for streaming uploads (otherwise use form data names)
- sanitize filename, place uploads in /uploads/<uuid>/<sanitized-filename>-<random>.wacz
- uploads have internal id 'upload-<uuid>'
- create UploadedCrawl object with CrawlFiles pointing to the newly uploaded files, set state to 'complete'
- handle upload failures, abort multipart upload
- ensure uploads added within org bucket path
- return id / added when adding new UploadedCrawl
- support listing, deleting, and patch /uploads
- support upload details via /replay.json to support replay
- add support for 'replaceId=<id>', which removes all previous files in the upload after the new upload succeeds; if replaceId doesn't exist, a new upload is created (stream endpoint only so far)
- support patching upload metadata: notes, tags and name on uploads (UpdateUpload extends UpdateCrawl and adds 'name')
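
A rough sketch of the streaming path (helper names and wiring are illustrative, not the actual implementation, though the S3 multipart calls are the standard botocore client methods):

    import re
    from uuid import uuid4

    def sanitize_filename(filename):
        # keep a conservative allowlist of characters
        return re.sub(r"[^\w.-]", "", filename)

    def upload_key(org_prefix, filename):
        # uploads land at <org>/uploads/<uuid>/<sanitized-filename>-<random>.wacz
        base = sanitize_filename(filename)
        if base.endswith(".wacz"):
            base = base[: -len(".wacz")]
        return f"{org_prefix}/uploads/{uuid4()}/{base}-{uuid4().hex[:8]}.wacz"

    def stream_to_s3(client, bucket, key, chunks):
        # upload parts as they arrive; abort the multipart upload on any failure
        mup = client.create_multipart_upload(Bucket=bucket, Key=key)
        parts = []
        try:
            for num, chunk in enumerate(chunks, start=1):
                resp = client.upload_part(
                    Bucket=bucket, Key=key, PartNumber=num,
                    UploadId=mup["UploadId"], Body=chunk,
                )
                parts.append({"PartNumber": num, "ETag": resp["ETag"]})
            client.complete_multipart_upload(
                Bucket=bucket, Key=key, UploadId=mup["UploadId"],
                MultipartUpload={"Parts": parts},
            )
        except Exception:
            client.abort_multipart_upload(
                Bucket=bucket, Key=key, UploadId=mup["UploadId"]
            )
            raise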

* base crawls api: Add /all-crawls list and delete endpoints for all crawl types (without resources)
- support all-crawls/<id>/replay.json with resources
- Use ListCrawlOut model for /all-crawls list endpoint
- Extend BaseCrawlOut from ListCrawlOut, add type
- use 'type: crawl' for crawls and 'type: upload' for uploads
- migration: ensure all previous crawl objects missing a type are set to 'type: crawl'
- indexes: add db indices on the 'type' field alone and compound indices of 'type' with oid, cid, finished, and state (index sketch below)
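
A sketch of what the migration and index creation might look like with motor (the collection handle and helper name are assumptions):

    async def migrate_crawl_types(crawls):
        # backfill: any pre-existing crawl object without a type is a 'crawl'
        await crawls.update_many(
            {"type": {"$in": [None, ""]}}, {"$set": {"type": "crawl"}}
        )
        # index on 'type' alone, plus compound indices with 'type'
        await crawls.create_index([("type", 1)])
        for field in ("oid", "cid", "finished", "state"):
            await crawls.create_index([("type", 1), (field, 1)])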

* tests: add tests for multipart and streaming uploads, listing uploads, and deleting uploads
- add sample WACZ for upload testing: 'example.wacz' and 'example-2.wacz'

* collections: support adding and removing both crawls and uploads via the base crawl API (conceptual sketch after this list)
- include collection_ids in /all-crawls list
- collections replay.json can include both crawls and uploads
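
Conceptually, a collection's replay.json now merges the files of every member base crawl, whether crawl or upload; a hypothetical sketch (field names assumed):

    async def get_collection_resources(crawls, coll_id):
        # gather files from all member crawls and uploads alike
        resources = []
        async for crawl in crawls.find({"collection_ids": coll_id}):
            resources.extend(crawl.get("files", []))
        return resources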

bump version to 1.6.0-beta.2
---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-07-07 09:13:26 -07:00


import os
from urllib.parse import urljoin

import requests

from .conftest import API_PREFIX
from .utils import read_in_chunks

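# `read_in_chunks` (imported above) is assumed to be a small generator helper
# along these lines, yielding the file in fixed-size chunks so `requests`
# streams the request body instead of buffering the whole WACZ in memory:
#
#     def read_in_chunks(fh, blocksize=1024):
#         while True:
#             chunk = fh.read(blocksize)
#             if not chunk:
#                 break
#             yield chunk
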
upload_id = None
upload_id_2 = None
upload_dl_path = None

curr_dir = os.path.dirname(os.path.realpath(__file__))


def test_upload_stream(admin_auth_headers, default_org_id):
    with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
        r = requests.put(
            f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test.wacz&name=My%20Upload&notes=Testing%0AData",
            headers=admin_auth_headers,
            data=read_in_chunks(fh),
        )

    assert r.status_code == 200
    assert r.json()["added"]

    global upload_id
    upload_id = r.json()["id"]


def test_list_stream_upload(admin_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads",
        headers=admin_auth_headers,
    )
    results = r.json()
    assert len(results["items"]) > 0

    found = None
    for res in results["items"]:
        if res["id"] == upload_id:
            found = res

    assert found
    assert found["name"] == "My Upload"
    assert found["notes"] == "Testing\nData"

    # the list endpoint returns metadata only, without file details
    assert "files" not in found
    assert "resources" not in found


def test_get_stream_upload(admin_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    result = r.json()
    assert "files" not in result

    upload_dl_path = result["resources"][0]["path"]
    assert "test-" in result["resources"][0]["name"]
    assert result["resources"][0]["name"].endswith(".wacz")

    dl_path = urljoin(API_PREFIX, upload_dl_path)
    wacz_resp = requests.get(dl_path)
    actual = wacz_resp.content

    with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
        expected = fh.read()

    assert len(actual) == len(expected)
    assert actual == expected

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200


def test_get_upload_replay_json(admin_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data
    assert data["id"] == upload_id
    assert data["name"] == "My Upload"
    assert data["resources"]
    assert data["resources"][0]["path"]
    assert data["resources"][0]["size"]
    assert data["resources"][0]["hash"]
    assert "files" not in data
    assert "errors" not in data or data.get("errors") is None


def test_get_upload_replay_json_admin(admin_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/all/uploads/{upload_id}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data
    assert data["id"] == upload_id
    assert data["name"] == "My Upload"
    assert data["resources"]
    assert data["resources"][0]["path"]
    assert data["resources"][0]["size"]
    assert data["resources"][0]["hash"]
    assert "files" not in data
    assert "errors" not in data or data.get("errors") is None


def test_replace_upload(admin_auth_headers, default_org_id):
    actual_id = do_upload_replace(admin_auth_headers, default_org_id, upload_id)
    # replacing an existing upload keeps the same id
    assert upload_id == actual_id


def do_upload_replace(admin_auth_headers, default_org_id, upload_id):
    with open(os.path.join(curr_dir, "data", "example-2.wacz"), "rb") as fh:
        r = requests.put(
            f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test.wacz&name=My%20Upload%20Updated&replaceId={upload_id}",
            headers=admin_auth_headers,
            data=read_in_chunks(fh),
        )

    assert r.status_code == 200
    assert r.json()["added"]
    actual_id = r.json()["id"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{actual_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    result = r.json()

    # only one file, previous file removed
    assert len(result["resources"]) == 1

    dl_path = urljoin(API_PREFIX, result["resources"][0]["path"])
    wacz_resp = requests.get(dl_path)
    actual = wacz_resp.content

    with open(os.path.join(curr_dir, "data", "example-2.wacz"), "rb") as fh:
        expected = fh.read()

    assert len(actual) == len(expected)
    assert actual == expected

    return actual_id


def test_update_upload_metadata(admin_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["name"] == "My Upload Updated"
    assert not data["tags"]
    assert not data["notes"]

    # Submit patch request to update name, tags, and notes
    UPDATED_NAME = "New Upload Name"
    UPDATED_TAGS = ["wr-test-1-updated", "wr-test-2-updated"]
    UPDATED_NOTES = "Lorem ipsum test note."

    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}",
        headers=admin_auth_headers,
        json={"tags": UPDATED_TAGS, "notes": UPDATED_NOTES, "name": UPDATED_NAME},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["updated"]

    # Verify update was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
    assert data["notes"] == UPDATED_NOTES
    assert data["name"] == UPDATED_NAME


def test_delete_stream_upload(admin_auth_headers, default_org_id):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": [upload_id]},
    )
    assert r.json()["deleted"]


def test_replace_upload_non_existent(admin_auth_headers, default_org_id):
    global upload_id

    # same replacement, but now against a non-existent (just-deleted) upload
    actual_id = do_upload_replace(admin_auth_headers, default_org_id, upload_id)

    # a new upload is created, with a new id
    assert actual_id != upload_id
    upload_id = actual_id


def test_delete_stream_upload_2(admin_auth_headers, default_org_id):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": [upload_id]},
    )
    assert r.json()["deleted"]


def test_upload_form(admin_auth_headers, default_org_id):
    with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
        data = fh.read()

    files = [
        ("uploads", ("test.wacz", data, "application/octet-stream")),
        ("uploads", ("test-2.wacz", data, "application/octet-stream")),
        ("uploads", ("test.wacz", data, "application/octet-stream")),
    ]

    r = requests.put(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/formdata?name=test2.wacz",
        headers=admin_auth_headers,
        files=files,
    )
    assert r.status_code == 200
    assert r.json()["added"]

    global upload_id_2
    upload_id_2 = r.json()["id"]


def test_list_form_upload(admin_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads",
        headers=admin_auth_headers,
    )
    results = r.json()
    assert len(results["items"]) > 0

    found = None
    for res in results["items"]:
        if res["id"] == upload_id_2:
            found = res

    assert found
    assert found["name"] == "test2.wacz"
    # check the matched item, not the loop variable left over from iteration
    assert "files" not in found
    assert "resources" not in found


def test_verify_from_upload_resource_count(admin_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id_2}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    result = r.json()
    assert "files" not in result
    # three files were submitted via the form data endpoint
    assert len(result["resources"]) == 3

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id_2}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200


def test_list_all_crawls(admin_auth_headers, default_org_id):
    """Test that /all-crawls lists crawls and uploads before deleting uploads"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    items = data["items"]
    assert len(items) == data["total"]

    crawls = [item for item in items if item["type"] == "crawl"]
    assert len(crawls) > 0

    uploads = [item for item in items if item["type"] == "upload"]
    assert len(uploads) > 0

    for item in items:
        assert item["type"] in ("crawl", "upload")
        assert item["id"]
        assert item["userid"]
        assert item["oid"] == default_org_id
        assert item["started"]
        assert item["finished"]
        assert item["state"]


def test_get_upload_from_all_crawls(admin_auth_headers, default_org_id):
    """Test that an upload can be fetched by id via /all-crawls"""
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id_2}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["name"] == "test2.wacz"
    assert "files" not in data
    assert data["resources"]


def test_get_upload_replay_json_from_all_crawls(admin_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id_2}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data
    assert data["id"] == upload_id_2
    assert data["name"] == "test2.wacz"
    assert data["resources"]
    assert data["resources"][0]["path"]
    assert data["resources"][0]["size"]
    assert data["resources"][0]["hash"]
    assert "files" not in data
    assert "errors" not in data or data.get("errors") is None


def test_get_upload_replay_json_admin_from_all_crawls(
    admin_auth_headers, default_org_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/all/all-crawls/{upload_id_2}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data
    assert data["id"] == upload_id_2
    assert data["name"] == "test2.wacz"
    assert data["resources"]
    assert data["resources"][0]["path"]
    assert data["resources"][0]["size"]
    assert data["resources"][0]["hash"]
    assert "files" not in data
    assert "errors" not in data or data.get("errors") is None


def test_delete_form_upload_from_all_crawls(admin_auth_headers, default_org_id):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": [upload_id_2]},
    )
    assert r.json()["deleted"]


def test_ensure_deleted(admin_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads",
        headers=admin_auth_headers,
    )
    results = r.json()
    for res in results["items"]:
        # neither deleted upload should still be listed
        if res["id"] in (upload_id_2, upload_id):
            assert False