browsertrix/backend/test/test_collections.py
Tessa Walsh a031fab313
Backend work for public collections (#2198)
Fixes #2182 

This rather large PR adds the rest of what should be needed for public
collections work in the frontend.

New API endpoints include (see the sketch after this list):

- Public collections endpoints: GET, streaming download
- Paginated list of URLs in a collection, with snapshot (page) info for
each
- Collection endpoint to set home URL
- Collection endpoint to upload thumbnail as stream
- DELETE endpoint to remove collection thumbnail
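
A rough sketch of calling these endpoints (the base URL, IDs, slug, and
token are placeholders; the tests in this file are the authoritative
reference):

```python
import requests

# Placeholders: substitute a real deployment, org, collection, and token
API_PREFIX = "https://app.example.com/api"
org_slug, oid, coll_id = "my-org", "<org-uuid>", "<coll-uuid>"
auth = {"Authorization": "Bearer <jwt>"}

# Public collection GET and streaming download (no auth required)
requests.get(f"{API_PREFIX}/public/orgs/{org_slug}/collections/{coll_id}")
requests.get(
    f"{API_PREFIX}/public/orgs/{org_slug}/collections/{coll_id}/download",
    stream=True,
)

# Paginated list of URLs in the collection, with snapshot info per URL
requests.get(f"{API_PREFIX}/orgs/{oid}/collections/{coll_id}/urls", headers=auth)

# Set the collection home URL to one of its pages
requests.post(
    f"{API_PREFIX}/orgs/{oid}/collections/{coll_id}/home-url",
    json={"pageId": "<page-uuid>"},
    headers=auth,
)

# Upload a thumbnail as a stream, then delete it
with open("thumbnail.jpg", "rb") as fh:
    requests.put(
        f"{API_PREFIX}/orgs/{oid}/collections/{coll_id}/thumbnail?filename=thumbnail.jpg",
        data=fh,
        headers=auth,
    )
requests.delete(
    f"{API_PREFIX}/orgs/{oid}/collections/{coll_id}/thumbnail", headers=auth
)
```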

Changes to existing API endpoints include:

- Paginating public collection list results
- Several `pages` endpoints that previously only supported `/crawls/` in
their path, e.g. `/orgs/{oid}/crawls/all/pages/reAdd`, now support
`/uploads/` and `/all-crawls/` namespaces as well. This is necessitated
by adding pages for uploads to the database (see below). For
`/orgs/{oid}/namespace/all/pages/reAdd`, `crawls` or `uploads` serves
as a filter so that only crawls of the given type are affected. Other
endpoints are more liberal at this point, and will perform the same
action regardless of the namespace used in the route (we'll likely want
to make this more consistent in a follow-up).
- `/orgs/{oid}/namespace/all/pages/reAdd` now kicks off a background job
rather than doing all of the computation in an asyncio task in the
backend container. The background job additionally updates collection
date ranges, page/size counts, and tags for each collection in the org
after pages have been (re)added (sketch below).
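
A sketch of the namespaced reAdd call, reusing the placeholders from the
sketch above (POST is assumed here; check the backend routes for the
actual method):

```python
# For reAdd, "crawls" or "uploads" filters to that crawl type, while
# "all-crawls" covers both. The request now returns once a background job
# has been launched, rather than blocking on in-process recomputation.
for namespace in ("crawls", "uploads", "all-crawls"):
    r = requests.post(
        f"{API_PREFIX}/orgs/{oid}/{namespace}/all/pages/reAdd", headers=auth
    )
    assert r.status_code == 200
```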

Other big changes:

- New uploads will now have their pages read into the database!
Collection page counts now also include uploads.
- A migration was added to start a background job for each org that will
add the pages for previously-uploaded WACZ files to the database and
update collections accordingly
- Adds a new `ImageFile` subclass of `BaseFile` for thumbnails that we
can use for other user-uploaded image files moving forward, with
separate output models for authenticated and public endpoints (rough
sketch below)
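
A rough sketch of the output-model split; the class names here are
illustrative, and the fields mirror what the thumbnail tests in this file
assert on:

```python
from datetime import datetime

from pydantic import BaseModel


class PublicImageFileOut(BaseModel):
    """Fields safe to expose on public endpoints."""

    name: str
    path: str
    hash: str
    size: int
    mime: str


class ImageFileOut(PublicImageFileOut):
    """Authenticated output adds provenance fields the public model omits."""

    originalFilename: str
    userid: str
    userName: str
    created: datetime
```
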
2025-01-13 15:15:48 -08:00


import os
from tempfile import TemporaryFile
from uuid import uuid4
from zipfile import ZIP_STORED, ZipFile

import requests

from .conftest import API_PREFIX, NON_DEFAULT_ORG_NAME, NON_DEFAULT_ORG_SLUG
from .utils import read_in_chunks

COLLECTION_NAME = "Test collection"
PUBLIC_COLLECTION_NAME = "Public Test collection"
UPDATED_NAME = "Updated tést cöllection"
SECOND_COLLECTION_NAME = "second-collection"
DESCRIPTION = "Test description"
CAPTION = "Short caption"
UPDATED_CAPTION = "Updated caption"
NON_PUBLIC_COLL_FIELDS = (
"modified",
"tags",
"homeUrlPageId",
)
NON_PUBLIC_IMAGE_FIELDS = ("originalFilename", "userid", "userName", "created")
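
# Module-level state shared across the tests below, which run in file order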
_coll_id = None
_second_coll_id = None
_public_coll_id = None
_second_public_coll_id = None
upload_id = None
modified = None
default_org_slug = None
curr_dir = os.path.dirname(os.path.realpath(__file__))


def test_create_collection(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
default_thumbnail_name = "default-thumbnail.jpg"
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=crawler_auth_headers,
json={
"crawlIds": [crawler_crawl_id],
"name": COLLECTION_NAME,
"caption": CAPTION,
"defaultThumbnailName": default_thumbnail_name,
},
)
assert r.status_code == 200
data = r.json()
assert data["added"]
assert data["name"] == COLLECTION_NAME
global _coll_id
_coll_id = data["id"]
# Verify crawl in collection
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id in r.json()["collectionIds"]
assert r.json()["collections"] == [{"name": COLLECTION_NAME, "id": _coll_id}]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["name"] == COLLECTION_NAME
assert data["caption"] == CAPTION
assert data["crawlCount"] == 1
assert data["pageCount"] > 0
assert data["totalSize"] > 0
modified = data["modified"]
assert modified
assert modified.endswith("Z")
assert data["dateEarliest"]
assert data["dateLatest"]
assert data["defaultThumbnailName"] == default_thumbnail_name
assert data["allowPublicDownload"]


def test_create_public_collection(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=crawler_auth_headers,
json={
"crawlIds": [crawler_crawl_id],
"name": PUBLIC_COLLECTION_NAME,
"caption": CAPTION,
"access": "public",
"allowPublicDownload": False,
},
)
assert r.status_code == 200
data = r.json()
assert data["added"]
assert data["name"] == PUBLIC_COLLECTION_NAME
global _public_coll_id
_public_coll_id = data["id"]
# Verify that it is public
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
headers=crawler_auth_headers,
)
assert r.json()["access"] == "public"


def test_create_collection_taken_name(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=crawler_auth_headers,
json={
"crawlIds": [crawler_crawl_id],
"name": COLLECTION_NAME,
},
)
assert r.status_code == 400
assert r.json()["detail"] == "collection_name_taken"


def test_create_collection_empty_name(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=crawler_auth_headers,
json={
"crawlIds": [crawler_crawl_id],
"name": "",
},
)
assert r.status_code == 422


def test_update_collection(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
json={
"description": DESCRIPTION,
"caption": UPDATED_CAPTION,
},
)
assert r.status_code == 200
assert r.json()["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["name"] == COLLECTION_NAME
assert data["description"] == DESCRIPTION
assert data["caption"] == UPDATED_CAPTION
assert data["crawlCount"] == 1
assert data["pageCount"] > 0
assert data["totalSize"] > 0
global modified
modified = data["modified"]
assert modified
assert modified.endswith("Z")
assert data["dateEarliest"]
assert data["dateLatest"]
assert data["defaultThumbnailName"]


def test_rename_collection(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
json={
"name": UPDATED_NAME,
},
)
assert r.status_code == 200
assert r.json()["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["name"] == UPDATED_NAME
assert data["modified"] >= modified


def test_rename_collection_taken_name(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
# Add second collection
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=crawler_auth_headers,
json={
"crawlIds": [crawler_crawl_id],
"name": SECOND_COLLECTION_NAME,
},
)
assert r.status_code == 200
data = r.json()
assert data["added"]
assert data["name"] == SECOND_COLLECTION_NAME
global _second_coll_id
_second_coll_id = data["id"]
# Try to rename first coll to second collection's name
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
json={"name": SECOND_COLLECTION_NAME},
)
assert r.status_code == 400
assert r.json()["detail"] == "collection_name_taken"


def test_add_remove_crawl_from_collection(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
# Add crawl
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
json={"crawlIds": [admin_crawl_id]},
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["crawlCount"] == 2
assert data["pageCount"] > 0
assert data["totalSize"] > 0
assert data["modified"] >= modified
assert data["tags"] == ["wr-test-2", "wr-test-1"]
assert data["dateEarliest"]
assert data["dateLatest"]
# Verify it was added
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id in r.json()["collectionIds"]
# Remove crawls
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/remove",
json={"crawlIds": [admin_crawl_id, crawler_crawl_id]},
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["crawlCount"] == 0
assert data["pageCount"] == 0
assert data["totalSize"] == 0
assert data["modified"] >= modified
assert data.get("tags", []) == []
assert data.get("dateEarliest") is None
assert data.get("dateLatest") is None
# Verify they were removed
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id not in r.json()["collectionIds"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id not in r.json()["collectionIds"]
# Add crawls back for further tests
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
json={"crawlIds": [admin_crawl_id, crawler_crawl_id]},
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["crawlCount"] == 2
assert data["pageCount"] > 0
assert data["totalSize"] > 0
assert data["modified"] >= modified
assert data["tags"] == ["wr-test-2", "wr-test-1"]
assert data["dateEarliest"]
assert data["dateLatest"]


def test_get_collection(crawler_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["name"] == UPDATED_NAME
assert data["oid"] == default_org_id
assert data["description"] == DESCRIPTION
assert data["caption"] == UPDATED_CAPTION
assert data["crawlCount"] == 2
assert data["pageCount"] > 0
assert data["totalSize"] > 0
assert data["modified"] >= modified
assert data["tags"] == ["wr-test-2", "wr-test-1"]
assert data["dateEarliest"]
assert data["dateLatest"]
assert data["defaultThumbnailName"]


def test_get_collection_replay(crawler_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/replay.json",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["name"] == UPDATED_NAME
assert data["oid"] == default_org_id
assert data["description"] == DESCRIPTION
assert data["caption"] == UPDATED_CAPTION
assert data["crawlCount"] == 2
assert data["pageCount"] > 0
assert data["totalSize"] > 0
assert data["modified"] >= modified
assert data["tags"] == ["wr-test-2", "wr-test-1"]
assert data["dateEarliest"]
assert data["dateLatest"]
assert data["defaultThumbnailName"]
resources = data["resources"]
assert resources
for resource in resources:
assert resource["name"]
assert resource["path"]
assert resource["size"]


def test_collection_public(crawler_auth_headers, default_org_id):
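    # Public replay endpoint should 404 while the collection is still private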
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
headers=crawler_auth_headers,
)
assert r.status_code == 404
# make public and test replay headers
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
json={
"access": "public",
},
)
assert r.status_code == 200
assert r.json()["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.headers["Access-Control-Allow-Origin"] == "*"
assert r.headers["Access-Control-Allow-Headers"] == "*"
# make unlisted and test replay headers
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
json={
"access": "unlisted",
},
)
assert r.status_code == 200
assert r.json()["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.headers["Access-Control-Allow-Origin"] == "*"
assert r.headers["Access-Control-Allow-Headers"] == "*"
# make private again
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
json={
"access": "private",
},
    )
    assert r.status_code == 200
    assert r.json()["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
headers=crawler_auth_headers,
)
assert r.status_code == 404


def test_collection_access_invalid_value(crawler_auth_headers, default_org_id):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
json={
"access": "invalid",
},
)
assert r.status_code == 422
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["access"] == "private"


def test_add_upload_to_collection(crawler_auth_headers, default_org_id):
with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
r = requests.put(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test-upload.wacz",
headers=crawler_auth_headers,
data=read_in_chunks(fh),
)
assert r.status_code == 200
assert r.json()["added"]
global upload_id
upload_id = r.json()["id"]
# Add upload
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
json={"crawlIds": [upload_id]},
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["crawlCount"] == 3
assert data["pageCount"] > 0
assert data["totalSize"] > 0
assert data["modified"]
assert data["tags"] == ["wr-test-2", "wr-test-1"]
assert data["dateEarliest"]
assert data["dateLatest"]
assert data["defaultThumbnailName"]
# Verify it was added
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id in r.json()["collectionIds"]
assert r.json()["collections"] == [{"name": UPDATED_NAME, "id": _coll_id}]


def test_download_streaming_collection(crawler_auth_headers, default_org_id):
    # Download the collection as a stream and check the zip contents
with TemporaryFile() as fh:
with requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/download",
headers=crawler_auth_headers,
stream=True,
) as r:
assert r.status_code == 200
for chunk in r.iter_content():
fh.write(chunk)
fh.seek(0)
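        # The streamed download is a multi-WACZ: member WACZ files plus a
        # datapackage.json manifest, stored without compression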
with ZipFile(fh, "r") as zip_file:
contents = zip_file.namelist()
assert len(contents) == 4
for filename in contents:
assert filename.endswith(".wacz") or filename == "datapackage.json"
assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_list_collections(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections", headers=crawler_auth_headers
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert len(items) == 3
first_coll = [coll for coll in items if coll["name"] == UPDATED_NAME][0]
assert first_coll["id"] == _coll_id
assert first_coll["name"] == UPDATED_NAME
assert first_coll["oid"] == default_org_id
assert first_coll["description"] == DESCRIPTION
assert first_coll["caption"] == UPDATED_CAPTION
assert first_coll["crawlCount"] == 3
assert first_coll["pageCount"] > 0
assert first_coll["totalSize"] > 0
assert first_coll["modified"]
assert first_coll["tags"] == ["wr-test-2", "wr-test-1"]
assert first_coll["access"] == "private"
assert first_coll["dateEarliest"]
assert first_coll["dateLatest"]
assert first_coll["defaultThumbnailName"]
second_coll = [coll for coll in items if coll["name"] == SECOND_COLLECTION_NAME][0]
assert second_coll["id"]
assert second_coll["name"] == SECOND_COLLECTION_NAME
assert second_coll["oid"] == default_org_id
assert second_coll.get("description") is None
assert second_coll["crawlCount"] == 1
assert second_coll["pageCount"] > 0
assert second_coll["totalSize"] > 0
assert second_coll["modified"]
assert second_coll["tags"] == ["wr-test-2"]
assert second_coll["access"] == "private"
assert second_coll["dateEarliest"]
assert second_coll["dateLatest"]


def test_remove_upload_from_collection(crawler_auth_headers, default_org_id):
# Remove upload
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/remove",
json={"crawlIds": [upload_id]},
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["crawlCount"] == 2
assert data["pageCount"] > 0
assert data["totalSize"] > 0
assert data["modified"] >= modified
assert data.get("tags") == ["wr-test-2", "wr-test-1"]
# Verify it was removed
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id not in r.json()["collectionIds"]


def test_filter_sort_collections(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
# Test filtering by name
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?name={SECOND_COLLECTION_NAME}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 1
items = data["items"]
assert len(items) == 1
coll = items[0]
assert coll["id"]
assert coll["name"] == SECOND_COLLECTION_NAME
assert coll["oid"] == default_org_id
assert coll.get("description") is None
# Test filtering by name prefix
name_prefix = SECOND_COLLECTION_NAME[0:4]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?namePrefix={name_prefix}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 1
items = data["items"]
assert len(items) == 1
coll = items[0]
assert coll["id"]
assert coll["name"] == SECOND_COLLECTION_NAME
assert coll["oid"] == default_org_id
assert coll.get("description") is None
# Test filtering by name prefix (case insensitive)
name_prefix = name_prefix.upper()
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?namePrefix={name_prefix}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 1
items = data["items"]
assert len(items) == 1
coll = items[0]
assert coll["id"]
assert coll["name"] == SECOND_COLLECTION_NAME
assert coll["oid"] == default_org_id
assert coll.get("description") is None
    # Test filtering by access
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?access=public",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 1
items = data["items"]
assert len(items) == 1
coll = items[0]
assert coll["id"]
assert coll["name"] == PUBLIC_COLLECTION_NAME
assert coll["oid"] == default_org_id
assert coll.get("description") is None
assert coll["access"] == "public"
# Test sorting by name, ascending (default)
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=name",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert items[0]["name"] == PUBLIC_COLLECTION_NAME
assert items[1]["name"] == SECOND_COLLECTION_NAME
assert items[2]["name"] == UPDATED_NAME
# Test sorting by name, descending
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=name&sortDirection=-1",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert items[0]["name"] == UPDATED_NAME
assert items[1]["name"] == SECOND_COLLECTION_NAME
assert items[2]["name"] == PUBLIC_COLLECTION_NAME
# Test sorting by description, ascending (default)
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=description",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert (
items[0]["name"] == SECOND_COLLECTION_NAME
or items[0]["name"] == PUBLIC_COLLECTION_NAME
)
assert items[0].get("description") is None
assert (
items[1]["name"] == PUBLIC_COLLECTION_NAME
or items[1]["name"] == SECOND_COLLECTION_NAME
)
assert items[1]["name"] != items[0]["name"]
assert items[1].get("description") is None
assert items[2]["name"] == UPDATED_NAME
assert items[2]["description"] == DESCRIPTION
# Test sorting by description, descending
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=description&sortDirection=-1",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert items[0]["name"] == UPDATED_NAME
assert items[0]["description"] == DESCRIPTION
assert (
items[1]["name"] == SECOND_COLLECTION_NAME
or items[1]["name"] == PUBLIC_COLLECTION_NAME
)
assert items[1].get("description") is None
assert (
items[2]["name"] == PUBLIC_COLLECTION_NAME
or items[2]["name"] == SECOND_COLLECTION_NAME
)
assert items[1]["name"] != items[2]["name"]
assert items[2].get("description") is None
# Test sorting by modified, ascending
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=modified",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert items[0]["modified"] <= items[1]["modified"]
# Test sorting by modified, descending
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=modified&sortDirection=-1",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert items[0]["modified"] >= items[1]["modified"]
# Test sorting by size, ascending
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=totalSize",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert items[0]["totalSize"] <= items[1]["totalSize"]
# Test sorting by size, descending
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=totalSize&sortDirection=-1",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert items[0]["totalSize"] >= items[1]["totalSize"]
# Invalid sort value
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=invalid",
headers=crawler_auth_headers,
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_sort_by"
# Invalid sort_direction value
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=modified&sortDirection=0",
headers=crawler_auth_headers,
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_sort_direction"


def test_list_public_collections(
crawler_auth_headers,
admin_auth_headers,
default_org_id,
non_default_org_id,
crawler_crawl_id,
admin_crawl_id,
):
# Create new public collection
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=crawler_auth_headers,
json={
"crawlIds": [crawler_crawl_id],
"name": "Second public collection",
"description": "Lorem ipsum",
"access": "public",
},
)
assert r.status_code == 200
global _second_public_coll_id
_second_public_coll_id = r.json()["id"]
# Get default org slug
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
global default_org_slug
default_org_slug = data["slug"]
org_name = data["name"]
# Verify that public profile isn't enabled
assert data["enablePublicProfile"] is False
assert data["publicDescription"] == ""
assert data["publicUrl"] == ""
# Try listing public collections without org public profile enabled
r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections")
assert r.status_code == 404
assert r.json()["detail"] == "public_profile_not_found"
# Enable public profile on org
public_description = "This is a test public org!"
public_url = "https://example.com"
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/public-profile",
headers=admin_auth_headers,
json={
"enablePublicProfile": True,
"publicDescription": public_description,
"publicUrl": public_url,
},
)
assert r.status_code == 200
assert r.json()["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["enablePublicProfile"]
assert data["publicDescription"] == public_description
assert data["publicUrl"] == public_url
    # List public collections with no auth, now that the public profile is enabled
r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections")
assert r.status_code == 200
data = r.json()
org_data = data["org"]
assert org_data["name"] == org_name
assert org_data["description"] == public_description
assert org_data["url"] == public_url
collections = data["collections"]
assert len(collections) == 2
for collection in collections:
assert collection["id"] in (_public_coll_id, _second_public_coll_id)
assert collection["oid"]
assert collection["access"] == "public"
assert collection["name"]
assert collection["dateEarliest"]
assert collection["dateLatest"]
assert collection["crawlCount"] > 0
assert collection["pageCount"] > 0
assert collection["totalSize"] > 0
# Test non-existing slug - it should return a 404 but not reveal
# whether or not an org exists with that slug
r = requests.get(f"{API_PREFIX}/public/orgs/nonexistentslug/collections")
assert r.status_code == 404
assert r.json()["detail"] == "public_profile_not_found"


def test_list_public_collections_no_colls(non_default_org_id, admin_auth_headers):
# Test existing org that's not public - should return same 404 as
# if org doesn't exist
r = requests.get(f"{API_PREFIX}/public/orgs/{NON_DEFAULT_ORG_SLUG}/collections")
assert r.status_code == 404
assert r.json()["detail"] == "public_profile_not_found"
# Enable public profile on org with zero public collections
r = requests.post(
f"{API_PREFIX}/orgs/{non_default_org_id}/public-profile",
headers=admin_auth_headers,
json={
"enablePublicProfile": True,
},
)
assert r.status_code == 200
assert r.json()["updated"]
# List public collections with no auth - should still get profile even
# with no public collections
r = requests.get(f"{API_PREFIX}/public/orgs/{NON_DEFAULT_ORG_SLUG}/collections")
assert r.status_code == 200
data = r.json()
assert data["org"]["name"] == NON_DEFAULT_ORG_NAME
assert data["collections"] == []


def test_set_collection_home_url(
crawler_auth_headers, default_org_id, crawler_crawl_id
):
# Get a page id from crawler_crawl_id
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
page = data["items"][0]
assert page
page_id = page["id"]
assert page_id
page_url = page["url"]
page_ts = page["ts"]
# Set page as home url
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/home-url",
headers=crawler_auth_headers,
json={"pageId": page_id},
)
assert r.status_code == 200
assert r.json()["updated"]
# Check that fields were set in collection as expected
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["homeUrl"] == page_url
assert data["homeUrlTs"] == page_ts
assert data["homeUrlPageId"] == page_id


def test_collection_url_list(crawler_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/urls",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
urls = data["items"]
assert urls
for url in urls:
assert url["url"]
assert url["count"] >= 1
snapshots = url["snapshots"]
assert snapshots
for snapshot in snapshots:
assert snapshot["pageId"]
assert snapshot["ts"]
assert snapshot["status"]


def test_upload_collection_thumbnail(crawler_auth_headers, default_org_id):
with open(os.path.join(curr_dir, "data", "thumbnail.jpg"), "rb") as fh:
r = requests.put(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/thumbnail?filename=thumbnail.jpg",
headers=crawler_auth_headers,
data=read_in_chunks(fh),
)
assert r.status_code == 200
assert r.json()["added"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
thumbnail = r.json()["thumbnail"]
assert thumbnail["name"]
assert thumbnail["path"]
assert thumbnail["hash"]
assert thumbnail["size"] > 0
assert thumbnail["originalFilename"] == "thumbnail.jpg"
assert thumbnail["mime"] == "image/jpeg"
assert thumbnail["userid"]
assert thumbnail["userName"]
assert thumbnail["created"]


def test_set_collection_default_thumbnail(crawler_auth_headers, default_org_id):
default_thumbnail_name = "orange-default.avif"
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
headers=crawler_auth_headers,
json={"defaultThumbnailName": default_thumbnail_name},
)
assert r.status_code == 200
assert r.json()["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _second_public_coll_id
assert data["defaultThumbnailName"] == default_thumbnail_name


def test_list_public_colls_home_url_thumbnail():
# Check we get expected data for each public collection
# and nothing we don't expect
r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections")
assert r.status_code == 200
collections = r.json()["collections"]
assert len(collections) == 2
for coll in collections:
assert coll["id"] in (_public_coll_id, _second_public_coll_id)
assert coll["oid"]
assert coll["access"] == "public"
assert coll["name"]
assert coll["resources"]
assert coll["dateEarliest"]
assert coll["dateLatest"]
assert coll["crawlCount"] > 0
assert coll["pageCount"] > 0
assert coll["totalSize"] > 0
for field in NON_PUBLIC_COLL_FIELDS:
assert field not in coll
if coll["id"] == _public_coll_id:
assert coll["allowPublicDownload"] is False
assert coll["caption"] == CAPTION
assert coll["homeUrl"]
assert coll["homeUrlTs"]
thumbnail = coll["thumbnail"]
assert thumbnail
assert thumbnail["name"]
assert thumbnail["path"]
assert thumbnail["hash"]
assert thumbnail["size"]
assert thumbnail["mime"]
for field in NON_PUBLIC_IMAGE_FIELDS:
assert field not in thumbnail
if coll["id"] == _second_public_coll_id:
assert coll["description"]
assert coll["defaultThumbnailName"] == "orange-default.avif"
assert coll["allowPublicDownload"]


def test_get_public_collection(default_org_id):
r = requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_public_coll_id}"
)
assert r.status_code == 200
coll = r.json()
assert coll["id"] == _public_coll_id
assert coll["oid"] == default_org_id
assert coll["access"] == "public"
assert coll["name"]
assert coll["resources"]
assert coll["dateEarliest"]
assert coll["dateLatest"]
assert coll["crawlCount"] > 0
assert coll["pageCount"] > 0
assert coll["totalSize"] > 0
for field in NON_PUBLIC_COLL_FIELDS:
assert field not in coll
assert coll["caption"] == CAPTION
assert coll["homeUrl"]
assert coll["homeUrlTs"]
assert coll["allowPublicDownload"] is False
thumbnail = coll["thumbnail"]
assert thumbnail
assert thumbnail["name"]
assert thumbnail["path"]
assert thumbnail["hash"]
assert thumbnail["size"]
assert thumbnail["mime"]
for field in NON_PUBLIC_IMAGE_FIELDS:
assert field not in thumbnail
# Invalid org slug - don't reveal whether org exists or not, use
# same exception as if collection doesn't exist
r = requests.get(
f"{API_PREFIX}/public/orgs/doesntexist/collections/{_public_coll_id}"
)
assert r.status_code == 404
assert r.json()["detail"] == "collection_not_found"
# Invalid collection id
random_uuid = uuid4()
r = requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{random_uuid}"
)
assert r.status_code == 404
assert r.json()["detail"] == "collection_not_found"
# Collection isn't public
r = requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{ _coll_id}"
)
assert r.status_code == 404
assert r.json()["detail"] == "collection_not_found"


def test_get_public_collection_unlisted(crawler_auth_headers, default_org_id):
# Make second public coll unlisted
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
headers=crawler_auth_headers,
json={
"access": "unlisted",
},
)
assert r.status_code == 200
assert r.json()["updated"]
# Verify single public collection GET endpoint works for unlisted collection
r = requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_second_public_coll_id}"
)
assert r.status_code == 200
coll = r.json()
assert coll["id"] == _second_public_coll_id
assert coll["oid"] == default_org_id
assert coll["access"] == "unlisted"
assert coll["name"]
assert coll["resources"]
assert coll["dateEarliest"]
assert coll["dateLatest"]
assert coll["crawlCount"] > 0
assert coll["pageCount"] > 0
assert coll["totalSize"] > 0
assert coll["defaultThumbnailName"] == "orange-default.avif"
assert coll["allowPublicDownload"]
for field in NON_PUBLIC_COLL_FIELDS:
assert field not in coll


def test_get_public_collection_unlisted_org_profile_disabled(
admin_auth_headers, default_org_id
):
# Disable org profile
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/public-profile",
headers=admin_auth_headers,
json={
"enablePublicProfile": False,
},
)
assert r.status_code == 200
assert r.json()["updated"]
# Verify we can still get public details for unlisted collection
r = requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_second_public_coll_id}"
)
assert r.status_code == 200
coll = r.json()
assert coll["id"] == _second_public_coll_id
assert coll["oid"] == default_org_id
assert coll["access"] == "unlisted"
assert coll["name"]
assert coll["resources"]
assert coll["dateEarliest"]
assert coll["dateLatest"]
assert coll["crawlCount"] > 0
assert coll["pageCount"] > 0
assert coll["totalSize"] > 0
assert coll["defaultThumbnailName"] == "orange-default.avif"
assert coll["allowPublicDownload"]
for field in NON_PUBLIC_COLL_FIELDS:
assert field not in coll
# Re-enable org profile
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/public-profile",
headers=admin_auth_headers,
json={
"enablePublicProfile": True,
},
)
assert r.status_code == 200
assert r.json()["updated"]


def test_delete_thumbnail(crawler_auth_headers, default_org_id):
r = requests.delete(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/thumbnail",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["deleted"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json().get("thumbnail") is None
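    # The second public collection only has a default thumbnail name set,
    # so there is no uploaded thumbnail file to delete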
r = requests.delete(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}/thumbnail",
headers=crawler_auth_headers,
)
assert r.status_code == 404
assert r.json()["detail"] == "thumbnail_not_found"


def test_unset_collection_home_url(
crawler_auth_headers, default_org_id, crawler_crawl_id
):
# Unset home url
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/home-url",
headers=crawler_auth_headers,
json={"pageId": None},
)
assert r.status_code == 200
assert r.json()["updated"]
    # Check that the home URL fields were cleared as expected
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data.get("homeUrl") is None
assert data.get("homeUrlTs") is None
assert data.get("homeUrlPageId") is None


def test_download_streaming_public_collection(crawler_auth_headers, default_org_id):
# Check that download is blocked if allowPublicDownload is False
with requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_public_coll_id}/download",
stream=True,
) as r:
assert r.status_code == 403
# Set allowPublicDownload to True and then check downloading works
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
headers=crawler_auth_headers,
json={
"allowPublicDownload": True,
},
)
assert r.status_code == 200
assert r.json()["updated"]
with TemporaryFile() as fh:
with requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_public_coll_id}/download",
stream=True,
) as r:
assert r.status_code == 200
for chunk in r.iter_content():
fh.write(chunk)
fh.seek(0)
with ZipFile(fh, "r") as zip_file:
contents = zip_file.namelist()
assert len(contents) == 2
for filename in contents:
assert filename.endswith(".wacz") or filename == "datapackage.json"
assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_download_streaming_public_collection_profile_disabled(
admin_auth_headers, default_org_id
):
# Disable org public profile and ensure download still works for public collection
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/public-profile",
headers=admin_auth_headers,
json={
"enablePublicProfile": False,
},
)
assert r.status_code == 200
assert r.json()["updated"]
with TemporaryFile() as fh:
with requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_public_coll_id}/download",
stream=True,
) as r:
assert r.status_code == 200
for chunk in r.iter_content():
fh.write(chunk)
fh.seek(0)
with ZipFile(fh, "r") as zip_file:
contents = zip_file.namelist()
assert len(contents) == 2
for filename in contents:
assert filename.endswith(".wacz") or filename == "datapackage.json"
assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_id):
# Delete second collection
r = requests.delete(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["success"]
# Verify collection id was removed from crawl
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _second_coll_id not in r.json()["collectionIds"]
# Make a new empty (no crawls) collection and delete it
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=crawler_auth_headers,
json={
"name": "To delete",
"description": "Deleting a collection with no crawls should work.",
},
)
assert r.status_code == 200
data = r.json()
assert data["added"]
coll_id = data["id"]
r = requests.delete(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["success"]