browsertrix/backend/test/test_collections.py
Ilya Kreymer 7b2932c582
Add initial pages + pagesQuery endpoint to /replay.json APIs (#2380)
Fixes #2360 

- Adds `initialPages` to the /replay.json response for collections, returning
up to 25 pages (seed pages first, then sorted by capture time).
- Adds `pagesQueryUrl` to /replay.json.
- Adds a public pages search endpoint to support public collections.
- Adds `preloadResources`, a list of WACZ files that should always be
loaded, to /replay.json.
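
A minimal sketch of how a replay client might consume these new fields (not part of this PR; the API prefix, org/collection IDs, auth header, and the assumption that `pagesQueryUrl` is an absolute URL are placeholders):

```python
import requests

API_PREFIX = "https://app.example.com/api"  # placeholder deployment
org_id = "<org-id>"
coll_id = "<collection-id>"
headers = {"Authorization": "Bearer <token>"}

# Replay metadata now includes initialPages, pagesQueryUrl, preloadResources
data = requests.get(
    f"{API_PREFIX}/orgs/{org_id}/collections/{coll_id}/replay.json",
    headers=headers,
).json()

# Up to 25 initial pages: seed pages first, then sorted by capture time
for page in data["initialPages"]:
    print(page.get("url"), page.get("ts"))

# WACZ files that should always be loaded before replay starts
for resource in data.get("preloadResources", []):
    print(resource)

# Further pages come from the advertised query endpoint (the public variant
# is /collections/{id}/public/pages); assumed here to be an absolute URL
pages = requests.get(
    data["pagesQueryUrl"], headers=headers, params={"search": "example"}
).json()
print(pages["total"], len(pages["items"]))
```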

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2025-02-13 16:53:47 -08:00


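"""Integration tests for the collections API: create/update/delete, crawl and
upload membership, page listing and search, public collections and org public
profiles, thumbnails, home URL, and streaming downloads."""
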
import requests
import os
from uuid import uuid4
from zipfile import ZipFile, ZIP_STORED
from tempfile import TemporaryFile
from .conftest import API_PREFIX, NON_DEFAULT_ORG_NAME, NON_DEFAULT_ORG_SLUG
from .utils import read_in_chunks

COLLECTION_NAME = "Test collection"
COLLECTION_SLUG = "test-collection"
PUBLIC_COLLECTION_NAME = "Public Test collection"
PUBLIC_COLLECTION_SLUG = "custom-public-collection-slug"
UPDATED_NAME = "Updated tést cöllection"
UPDATED_SLUG = "updated-test-collection"
SECOND_COLLECTION_NAME = "second-collection"
DESCRIPTION = "Test description"
CAPTION = "Short caption"
UPDATED_CAPTION = "Updated caption"
SECOND_PUBLIC_COLL_SLUG = "second-public-collection"
NON_PUBLIC_COLL_FIELDS = (
"tags",
"homeUrlPageId",
)
NON_PUBLIC_IMAGE_FIELDS = ("originalFilename", "userid", "userName", "created")

_coll_id = None
_second_coll_id = None
_public_coll_id = None
_second_public_coll_id = None
upload_id = None
modified = None
default_org_slug = None
curr_dir = os.path.dirname(os.path.realpath(__file__))


def test_create_collection(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
default_thumbnail_name = "default-thumbnail.jpg"
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=crawler_auth_headers,
json={
"crawlIds": [crawler_crawl_id],
"name": COLLECTION_NAME,
"caption": CAPTION,
"defaultThumbnailName": default_thumbnail_name,
},
)
assert r.status_code == 200
data = r.json()
assert data["added"]
assert data["name"] == COLLECTION_NAME
global _coll_id
_coll_id = data["id"]
# Verify crawl in collection
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id in r.json()["collectionIds"]
assert r.json()["collections"] == [{"name": COLLECTION_NAME, "id": _coll_id}]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["name"] == COLLECTION_NAME
assert data["slug"] == COLLECTION_SLUG
assert data["caption"] == CAPTION
assert data["crawlCount"] == 1
assert data["pageCount"] > 0
assert data["uniquePageCount"] > 0
assert data["totalSize"] > 0
modified = data["modified"]
assert modified
assert modified.endswith("Z")
assert data["dateEarliest"]
assert data["dateLatest"]
assert data["defaultThumbnailName"] == default_thumbnail_name
assert data["allowPublicDownload"]


def test_create_public_collection(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=crawler_auth_headers,
json={
"crawlIds": [crawler_crawl_id],
"name": PUBLIC_COLLECTION_NAME,
"slug": PUBLIC_COLLECTION_SLUG,
"caption": CAPTION,
"access": "public",
"allowPublicDownload": False,
},
)
assert r.status_code == 200
data = r.json()
assert data["added"]
assert data["name"] == PUBLIC_COLLECTION_NAME
global _public_coll_id
_public_coll_id = data["id"]
# Verify that it is public
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
headers=crawler_auth_headers,
)
assert r.json()["access"] == "public"


def test_create_collection_taken_name(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=crawler_auth_headers,
json={
"crawlIds": [crawler_crawl_id],
"name": COLLECTION_NAME,
},
)
assert r.status_code == 400
assert r.json()["detail"] in ("collection_name_taken", "collection_slug_taken")


def test_create_collection_empty_name(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=crawler_auth_headers,
json={
"crawlIds": [crawler_crawl_id],
"name": "",
},
)
assert r.status_code == 422


def test_update_collection(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
json={
"description": DESCRIPTION,
"caption": UPDATED_CAPTION,
},
)
assert r.status_code == 200
assert r.json()["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["name"] == COLLECTION_NAME
assert data["description"] == DESCRIPTION
assert data["caption"] == UPDATED_CAPTION
assert data["crawlCount"] == 1
assert data["pageCount"] > 0
assert data["uniquePageCount"] > 0
assert data["totalSize"] > 0
global modified
modified = data["modified"]
assert modified
assert modified.endswith("Z")
assert data["dateEarliest"]
assert data["dateLatest"]
assert data["defaultThumbnailName"]


def test_rename_collection(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
json={
"name": UPDATED_NAME,
},
)
assert r.status_code == 200
assert r.json()["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["name"] == UPDATED_NAME
assert data["slug"] == UPDATED_SLUG
assert data["modified"] >= modified


def test_rename_collection_taken_name(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
# Add second collection
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=crawler_auth_headers,
json={
"crawlIds": [crawler_crawl_id],
"name": SECOND_COLLECTION_NAME,
},
)
assert r.status_code == 200
data = r.json()
assert data["added"]
assert data["name"] == SECOND_COLLECTION_NAME
global _second_coll_id
_second_coll_id = data["id"]
# Try to rename first coll to second collection's name
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
json={"name": SECOND_COLLECTION_NAME},
)
assert r.status_code == 400
assert r.json()["detail"] in ("collection_name_taken", "collection_slug_taken")
# Try to set first coll's slug to value already used for second collection
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
json={"slug": SECOND_COLLECTION_NAME},
)
assert r.status_code == 400
assert r.json()["detail"] == "collection_slug_taken"


def test_add_remove_crawl_from_collection(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
# Add crawl
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
json={"crawlIds": [admin_crawl_id]},
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["crawlCount"] == 2
assert data["pageCount"] > 0
assert data["uniquePageCount"] > 0
assert data["totalSize"] > 0
assert data["modified"] >= modified
assert data["tags"] == ["wr-test-2", "wr-test-1"]
assert data["dateEarliest"]
assert data["dateLatest"]
# Verify it was added
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id in r.json()["collectionIds"]
# Remove crawls
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/remove",
json={"crawlIds": [admin_crawl_id, crawler_crawl_id]},
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["crawlCount"] == 0
assert data["pageCount"] == 0
assert data["uniquePageCount"] == 0
assert data["totalSize"] == 0
assert data["modified"] >= modified
assert data.get("tags", []) == []
assert data.get("dateEarliest") is None
assert data.get("dateLatest") is None
# Verify they were removed
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id not in r.json()["collectionIds"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id not in r.json()["collectionIds"]
# Add crawls back for further tests
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
json={"crawlIds": [admin_crawl_id, crawler_crawl_id]},
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["crawlCount"] == 2
assert data["pageCount"] > 0
assert data["uniquePageCount"] > 0
assert data["totalSize"] > 0
assert data["modified"] >= modified
assert data["tags"] == ["wr-test-2", "wr-test-1"]
assert data["dateEarliest"]
assert data["dateLatest"]


def test_get_collection(crawler_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["name"] == UPDATED_NAME
assert data["slug"] == UPDATED_SLUG
assert data["oid"] == default_org_id
assert data["description"] == DESCRIPTION
assert data["caption"] == UPDATED_CAPTION
assert data["crawlCount"] == 2
assert data["pageCount"] > 0
assert data["uniquePageCount"] > 0
assert data["totalSize"] > 0
assert data["modified"] >= modified
assert data["tags"] == ["wr-test-2", "wr-test-1"]
assert data["dateEarliest"]
assert data["dateLatest"]
assert data["defaultThumbnailName"]


def test_get_collection_replay(crawler_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/replay.json",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["name"] == UPDATED_NAME
assert data["slug"] == UPDATED_SLUG
assert data["oid"] == default_org_id
assert data["description"] == DESCRIPTION
assert data["caption"] == UPDATED_CAPTION
assert data["crawlCount"] == 2
assert data["pageCount"] > 0
assert data["uniquePageCount"] > 0
assert data["totalSize"] > 0
assert data["modified"] >= modified
assert data["tags"] == ["wr-test-2", "wr-test-1"]
assert data["dateEarliest"]
assert data["dateLatest"]
assert data["defaultThumbnailName"]
assert data["initialPages"]
assert data["pagesQueryUrl"].endswith(
f"/orgs/{default_org_id}/collections/{_coll_id}/pages"
)
assert "preloadResources" in data
resources = data["resources"]
assert resources
for resource in resources:
assert resource["name"]
assert resource["path"]
assert resource["size"]


def test_collection_public(crawler_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
headers=crawler_auth_headers,
)
assert r.status_code == 404
# make public and test replay headers
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
json={
"access": "public",
},
)
assert r.status_code == 200
assert r.json()["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
headers=crawler_auth_headers,
)
data = r.json()
assert data["initialPages"]
assert data["pagesQueryUrl"].endswith(
f"/orgs/{default_org_id}/collections/{_coll_id}/public/pages"
)
assert "preloadResources" in data
assert r.status_code == 200
assert r.headers["Access-Control-Allow-Origin"] == "*"
assert r.headers["Access-Control-Allow-Headers"] == "*"
# test public pages endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] > 0
assert data["items"]
# make unlisted and test replay headers
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
json={
"access": "unlisted",
},
)
assert r.status_code == 200
assert r.json()["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.headers["Access-Control-Allow-Origin"] == "*"
assert r.headers["Access-Control-Allow-Headers"] == "*"
# make private again
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
json={
"access": "private",
},
)
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
headers=crawler_auth_headers,
)
assert r.status_code == 404
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
headers=crawler_auth_headers,
)
assert r.status_code == 404


def test_collection_access_invalid_value(crawler_auth_headers, default_org_id):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
json={
"access": "invalid",
},
)
assert r.status_code == 422
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["access"] == "private"


def test_add_upload_to_collection(crawler_auth_headers, default_org_id):
with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
r = requests.put(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test-upload.wacz",
headers=crawler_auth_headers,
data=read_in_chunks(fh),
)
assert r.status_code == 200
assert r.json()["added"]
global upload_id
upload_id = r.json()["id"]
# Add upload
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
json={"crawlIds": [upload_id]},
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["crawlCount"] == 3
assert data["pageCount"] > 0
assert data["uniquePageCount"] > 0
assert data["totalSize"] > 0
assert data["modified"]
assert data["tags"] == ["wr-test-2", "wr-test-1"]
assert data["dateEarliest"]
assert data["dateLatest"]
assert data["defaultThumbnailName"]
# Verify it was added
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id in r.json()["collectionIds"]
assert r.json()["collections"] == [{"name": UPDATED_NAME, "id": _coll_id}]


def test_download_streaming_collection(crawler_auth_headers, default_org_id):
# Stream the collection download and verify the WACZ archive contents
with TemporaryFile() as fh:
with requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/download",
headers=crawler_auth_headers,
stream=True,
) as r:
assert r.status_code == 200
for chunk in r.iter_content():
fh.write(chunk)
fh.seek(0)
with ZipFile(fh, "r") as zip_file:
contents = zip_file.namelist()
assert len(contents) == 4
for filename in contents:
assert filename.endswith(".wacz") or filename == "datapackage.json"
assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_list_collections(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections", headers=crawler_auth_headers
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert len(items) == 3
first_coll = [coll for coll in items if coll["name"] == UPDATED_NAME][0]
assert first_coll["id"] == _coll_id
assert first_coll["name"] == UPDATED_NAME
assert first_coll["slug"] == UPDATED_SLUG
assert first_coll["oid"] == default_org_id
assert first_coll["description"] == DESCRIPTION
assert first_coll["caption"] == UPDATED_CAPTION
assert first_coll["crawlCount"] == 3
assert first_coll["pageCount"] > 0
assert first_coll["uniquePageCount"] > 0
assert first_coll["totalSize"] > 0
assert first_coll["modified"]
assert first_coll["tags"] == ["wr-test-2", "wr-test-1"]
assert first_coll["access"] == "private"
assert first_coll["dateEarliest"]
assert first_coll["dateLatest"]
assert first_coll["defaultThumbnailName"]
second_coll = [coll for coll in items if coll["name"] == SECOND_COLLECTION_NAME][0]
assert second_coll["id"]
assert second_coll["name"] == SECOND_COLLECTION_NAME
assert second_coll["slug"] == SECOND_COLLECTION_NAME
assert second_coll["oid"] == default_org_id
assert second_coll.get("description") is None
assert second_coll["crawlCount"] == 1
assert second_coll["pageCount"] > 0
assert second_coll["uniquePageCount"] > 0
assert second_coll["totalSize"] > 0
assert second_coll["modified"]
assert second_coll["tags"] == ["wr-test-2"]
assert second_coll["access"] == "private"
assert second_coll["dateEarliest"]
assert second_coll["dateLatest"]


def test_list_pages_in_collection(crawler_auth_headers, default_org_id):
# Test list endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 0
pages = data["items"]
assert pages
for page in pages:
assert page["id"]
assert page["oid"]
assert page["crawl_id"]
assert page["url"]
assert page["ts"]
assert page.get("title") or page.get("title") is None
assert page.get("loadState") or page.get("loadState") is None
assert page.get("status") or page.get("status") is None
assert page.get("mime") or page.get("mime") is None
assert page["isError"] in (None, True, False)
assert page["isFile"] in (None, True, False)
# Save info for page to test url and urlPrefix filters
coll_page = pages[0]
coll_page_id = coll_page["id"]
coll_page_url = coll_page["url"]
coll_page_ts = coll_page["ts"]
coll_page_title = coll_page["title"]
# Test search filter
partial_title = coll_page_title[:5]
partial_url = coll_page_url[:8]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_title}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
for matching_page in data["items"]:
assert (
partial_title in matching_page["title"]
or partial_url in matching_page["url"]
)
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_url}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
for matching_page in data["items"]:
assert (
partial_title in matching_page["title"]
or partial_url in matching_page["url"]
)
# Test exact url filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?url={coll_page_url}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
for matching_page in data["items"]:
assert matching_page["url"] == coll_page_url
# Test exact url and ts filters together
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?url={coll_page_url}&ts={coll_page_ts}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
for matching_page in data["items"]:
assert matching_page["url"] == coll_page_url
assert matching_page["ts"] == coll_page_ts
# Test urlPrefix filter
url_prefix = coll_page_url[:8]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?urlPrefix={url_prefix}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
found_matching_page = False
for page in data["items"]:
if page["id"] == coll_page_id and page["url"] == coll_page_url:
found_matching_page = True
assert found_matching_page
# Test isSeed filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?isSeed=true",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
for page in data["items"]:
assert page["isSeed"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?isSeed=false",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
for page in data["items"]:
assert page["isSeed"] is False
# Test depth filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?depth=0",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
for page in data["items"]:
assert page["depth"] == 0
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?depth=1",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
for page in data["items"]:
assert page["depth"] == 1


def test_remove_upload_from_collection(crawler_auth_headers, default_org_id):
# Remove upload
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/remove",
json={"crawlIds": [upload_id]},
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["crawlCount"] == 2
assert data["pageCount"] > 0
assert data["uniquePageCount"] > 0
assert data["totalSize"] > 0
assert data["modified"] >= modified
assert data.get("tags") == ["wr-test-2", "wr-test-1"]
# Verify it was removed
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id not in r.json()["collectionIds"]


def test_filter_sort_collections(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
# Test filtering by name
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?name={SECOND_COLLECTION_NAME}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 1
items = data["items"]
assert len(items) == 1
coll = items[0]
assert coll["id"]
assert coll["name"] == SECOND_COLLECTION_NAME
assert coll["oid"] == default_org_id
assert coll.get("description") is None
# Test filtering by name prefix
name_prefix = SECOND_COLLECTION_NAME[0:4]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?namePrefix={name_prefix}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 1
items = data["items"]
assert len(items) == 1
coll = items[0]
assert coll["id"]
assert coll["name"] == SECOND_COLLECTION_NAME
assert coll["oid"] == default_org_id
assert coll.get("description") is None
# Test filtering by name prefix (case insensitive)
name_prefix = name_prefix.upper()
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?namePrefix={name_prefix}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 1
items = data["items"]
assert len(items) == 1
coll = items[0]
assert coll["id"]
assert coll["name"] == SECOND_COLLECTION_NAME
assert coll["oid"] == default_org_id
assert coll.get("description") is None
# Test filtering by access
name_prefix = name_prefix.upper()
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?access=public",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 1
items = data["items"]
assert len(items) == 1
coll = items[0]
assert coll["id"]
assert coll["name"] == PUBLIC_COLLECTION_NAME
assert coll["oid"] == default_org_id
assert coll.get("description") is None
assert coll["access"] == "public"
# Test sorting by name, ascending (default)
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=name",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert items[0]["name"] == PUBLIC_COLLECTION_NAME
assert items[1]["name"] == SECOND_COLLECTION_NAME
assert items[2]["name"] == UPDATED_NAME
# Test sorting by name, descending
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=name&sortDirection=-1",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert items[0]["name"] == UPDATED_NAME
assert items[1]["name"] == SECOND_COLLECTION_NAME
assert items[2]["name"] == PUBLIC_COLLECTION_NAME
# Test sorting by description, ascending (default)
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=description",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert (
items[0]["name"] == SECOND_COLLECTION_NAME
or items[0]["name"] == PUBLIC_COLLECTION_NAME
)
assert items[0].get("description") is None
assert (
items[1]["name"] == PUBLIC_COLLECTION_NAME
or items[1]["name"] == SECOND_COLLECTION_NAME
)
assert items[1]["name"] != items[0]["name"]
assert items[1].get("description") is None
assert items[2]["name"] == UPDATED_NAME
assert items[2]["description"] == DESCRIPTION
# Test sorting by description, descending
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=description&sortDirection=-1",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert items[0]["name"] == UPDATED_NAME
assert items[0]["description"] == DESCRIPTION
assert (
items[1]["name"] == SECOND_COLLECTION_NAME
or items[1]["name"] == PUBLIC_COLLECTION_NAME
)
assert items[1].get("description") is None
assert (
items[2]["name"] == PUBLIC_COLLECTION_NAME
or items[2]["name"] == SECOND_COLLECTION_NAME
)
assert items[1]["name"] != items[2]["name"]
assert items[2].get("description") is None
# Test sorting by modified, ascending
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=modified",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert items[0]["modified"] <= items[1]["modified"]
# Test sorting by modified, descending
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=modified&sortDirection=-1",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert items[0]["modified"] >= items[1]["modified"]
# Test sorting by size, ascending
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=totalSize",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert items[0]["totalSize"] <= items[1]["totalSize"]
# Test sorting by size, descending
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=totalSize&sortDirection=-1",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert items[0]["totalSize"] >= items[1]["totalSize"]
# Invalid sort value
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=invalid",
headers=crawler_auth_headers,
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_sort_by"
# Invalid sort_direction value
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=modified&sortDirection=0",
headers=crawler_auth_headers,
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_sort_direction"


def test_list_public_collections(
crawler_auth_headers,
admin_auth_headers,
default_org_id,
non_default_org_id,
crawler_crawl_id,
admin_crawl_id,
):
# Create new public collection
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=crawler_auth_headers,
json={
"crawlIds": [crawler_crawl_id],
"name": "Second public collection",
"slug": SECOND_PUBLIC_COLL_SLUG,
"description": "Lorem ipsum",
"access": "public",
},
)
assert r.status_code == 200
global _second_public_coll_id
_second_public_coll_id = r.json()["id"]
# Get default org slug
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
global default_org_slug
default_org_slug = data["slug"]
org_name = data["name"]
# Verify that public profile isn't enabled
assert data["enablePublicProfile"] is False
assert data["publicDescription"] == ""
assert data["publicUrl"] == ""
# Try listing public collections without org public profile enabled
r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections")
assert r.status_code == 404
assert r.json()["detail"] == "public_profile_not_found"
# Enable public profile on org
public_description = "This is a test public org!"
public_url = "https://example.com"
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/public-profile",
headers=admin_auth_headers,
json={
"enablePublicProfile": True,
"publicDescription": public_description,
"publicUrl": public_url,
},
)
assert r.status_code == 200
assert r.json()["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["enablePublicProfile"]
assert data["publicDescription"] == public_description
assert data["publicUrl"] == public_url
# List public collections with no auth now that the public profile is enabled
r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections")
assert r.status_code == 200
data = r.json()
org_data = data["org"]
assert org_data["name"] == org_name
assert org_data["description"] == public_description
assert org_data["url"] == public_url
collections = data["collections"]
assert len(collections) == 2
for collection in collections:
assert collection["id"] in (_public_coll_id, _second_public_coll_id)
assert collection["oid"]
assert collection["access"] == "public"
assert collection["name"]
assert collection["created"]
assert collection["modified"]
assert collection["slug"]
assert collection["dateEarliest"]
assert collection["dateLatest"]
assert collection["crawlCount"] > 0
assert collection["pageCount"] > 0
assert collection["uniquePageCount"] > 0
assert collection["totalSize"] > 0
# Test non-existing slug - it should return a 404 but not reveal
# whether or not an org exists with that slug
r = requests.get(f"{API_PREFIX}/public/orgs/nonexistentslug/collections")
assert r.status_code == 404
assert r.json()["detail"] == "public_profile_not_found"


def test_list_public_collections_no_colls(non_default_org_id, admin_auth_headers):
# Test existing org that's not public - should return same 404 as
# if org doesn't exist
r = requests.get(f"{API_PREFIX}/public/orgs/{NON_DEFAULT_ORG_SLUG}/collections")
assert r.status_code == 404
assert r.json()["detail"] == "public_profile_not_found"
# Enable public profile on org with zero public collections
r = requests.post(
f"{API_PREFIX}/orgs/{non_default_org_id}/public-profile",
headers=admin_auth_headers,
json={
"enablePublicProfile": True,
},
)
assert r.status_code == 200
assert r.json()["updated"]
# List public collections with no auth - should still get profile even
# with no public collections
r = requests.get(f"{API_PREFIX}/public/orgs/{NON_DEFAULT_ORG_SLUG}/collections")
assert r.status_code == 200
data = r.json()
assert data["org"]["name"] == NON_DEFAULT_ORG_NAME
assert data["collections"] == []


def test_set_collection_home_url(
crawler_auth_headers, default_org_id, crawler_crawl_id
):
# Get a page id from crawler_crawl_id
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
page = data["items"][0]
assert page
page_id = page["id"]
assert page_id
page_url = page["url"]
page_ts = page["ts"]
# Set page as home url
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/home-url",
headers=crawler_auth_headers,
json={"pageId": page_id},
)
assert r.status_code == 200
assert r.json()["updated"]
# Check that fields were set in collection as expected
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["homeUrl"] == page_url
assert data["homeUrlTs"] == page_ts
assert data["homeUrlPageId"] == page_id


def test_collection_url_list(crawler_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/urls",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
urls = data["items"]
assert urls
for url in urls:
assert url["url"]
assert url["count"] >= 1
snapshots = url["snapshots"]
assert snapshots
for snapshot in snapshots:
assert snapshot["pageId"]
assert snapshot["ts"]
assert snapshot["status"]


def test_upload_collection_thumbnail(crawler_auth_headers, default_org_id):
# https://dev.browsertrix.com/api/orgs/c69247f4-415e-4abc-b449-e85d2f26c626/collections/b764fbe1-baab-4dc5-8dca-2db6f82c250b/thumbnail?filename=page-thumbnail_47fe599e-ed62-4edd-b078-93d4bf281e0f.jpeg&sourceUrl=https%3A%2F%2Fspecs.webrecorder.net%2F&sourceTs=2024-08-16T08%3A00%3A21.601000Z&sourcePageId=47fe599e-ed62-4edd-b078-93d4bf281e0f
with open(os.path.join(curr_dir, "data", "thumbnail.jpg"), "rb") as fh:
r = requests.put(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/thumbnail?filename=thumbnail.jpg&sourceUrl=https%3A%2F%2Fexample.com%2F&sourceTs=2024-08-16T08%3A00%3A21.601000Z&sourcePageId=1bba4aba-d5be-4943-ad48-d6710633d754",
headers=crawler_auth_headers,
data=read_in_chunks(fh),
)
assert r.status_code == 200
assert r.json()["added"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
collection = r.json()
thumbnail = collection["thumbnail"]
assert thumbnail["name"]
assert thumbnail["path"]
assert thumbnail["hash"]
assert thumbnail["size"] > 0
assert thumbnail["originalFilename"] == "thumbnail.jpg"
assert thumbnail["mime"] == "image/jpeg"
assert thumbnail["userid"]
assert thumbnail["userName"]
assert thumbnail["created"]
thumbnailSource = collection["thumbnailSource"]
assert thumbnailSource["url"]
assert thumbnailSource["urlTs"]
assert thumbnailSource["urlPageId"]
assert thumbnailSource["url"] == "https://example.com/"
assert thumbnailSource["urlTs"] == "2024-08-16T08:00:21.601000Z"
assert thumbnailSource["urlPageId"] == "1bba4aba-d5be-4943-ad48-d6710633d754"


def test_set_collection_default_thumbnail(crawler_auth_headers, default_org_id):
default_thumbnail_name = "orange-default.avif"
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
headers=crawler_auth_headers,
json={"defaultThumbnailName": default_thumbnail_name},
)
assert r.status_code == 200
assert r.json()["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _second_public_coll_id
assert data["defaultThumbnailName"] == default_thumbnail_name


def test_list_public_colls_home_url_thumbnail():
# Check we get expected data for each public collection
# and nothing we don't expect
non_public_fields = (
"tags",
"homeUrlPageId",
)
non_public_image_fields = ("originalFilename", "userid", "userName", "created")
r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections")
assert r.status_code == 200
collections = r.json()["collections"]
assert len(collections) == 2
for coll in collections:
assert coll["id"] in (_public_coll_id, _second_public_coll_id)
assert coll["oid"]
assert coll["access"] == "public"
assert coll["name"]
assert coll["created"]
assert coll["modified"]
assert coll["resources"]
assert coll["dateEarliest"]
assert coll["dateLatest"]
assert coll["crawlCount"] > 0
assert coll["pageCount"] > 0
assert coll["uniquePageCount"] > 0
assert coll["totalSize"] > 0
for field in non_public_fields:
assert field not in coll
if coll["id"] == _public_coll_id:
assert coll["allowPublicDownload"] is False
assert coll["caption"] == CAPTION
assert coll["homeUrl"]
assert coll["homeUrlTs"]
thumbnail = coll["thumbnail"]
assert thumbnail
assert thumbnail["name"]
assert thumbnail["path"]
assert thumbnail["hash"]
assert thumbnail["size"]
assert thumbnail["mime"]
for field in non_public_image_fields:
assert field not in thumbnail
if coll["id"] == _second_public_coll_id:
assert coll["description"]
assert coll["defaultThumbnailName"] == "orange-default.avif"
assert coll["allowPublicDownload"]


def test_get_public_collection(default_org_id):
r = requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}"
)
assert r.status_code == 200
coll = r.json()
assert coll["id"] == _public_coll_id
assert coll["oid"] == default_org_id
assert coll["access"] == "public"
assert coll["name"]
assert coll["created"]
assert coll["modified"]
assert coll["slug"] == PUBLIC_COLLECTION_SLUG
assert coll["resources"]
assert coll["dateEarliest"]
assert coll["dateLatest"]
assert coll["crawlCount"] > 0
assert coll["pageCount"] > 0
assert coll["uniquePageCount"] > 0
assert coll["totalSize"] > 0
for field in NON_PUBLIC_COLL_FIELDS:
assert field not in coll
assert coll["caption"] == CAPTION
assert coll["homeUrl"]
assert coll["homeUrlTs"]
assert coll["allowPublicDownload"] is False
thumbnail = coll["thumbnail"]
assert thumbnail
assert thumbnail["name"]
assert thumbnail["path"]
assert thumbnail["hash"]
assert thumbnail["size"]
assert thumbnail["mime"]
for field in NON_PUBLIC_IMAGE_FIELDS:
assert field not in thumbnail
# Invalid org slug - don't reveal whether org exists or not, use
# same exception as if collection doesn't exist
r = requests.get(
f"{API_PREFIX}/public/orgs/doesntexist/collections/{PUBLIC_COLLECTION_SLUG}"
)
assert r.status_code == 404
assert r.json()["detail"] == "collection_not_found"
# Unused slug
random_uuid = uuid4()
r = requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/someslugnotinuse"
)
assert r.status_code == 404
assert r.json()["detail"] == "collection_not_found"
# Collection isn't public
r = requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{UPDATED_SLUG}"
)
assert r.status_code == 404
assert r.json()["detail"] == "collection_not_found"


def test_get_public_collection_unlisted(crawler_auth_headers, default_org_id):
# Make second public coll unlisted
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
headers=crawler_auth_headers,
json={
"access": "unlisted",
},
)
assert r.status_code == 200
assert r.json()["updated"]
# Verify single public collection GET endpoint works for unlisted collection
r = requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{SECOND_PUBLIC_COLL_SLUG}"
)
assert r.status_code == 200
coll = r.json()
assert coll["id"] == _second_public_coll_id
assert coll["oid"] == default_org_id
assert coll["access"] == "unlisted"
assert coll["name"]
assert coll["created"]
assert coll["modified"]
assert coll["slug"] == SECOND_PUBLIC_COLL_SLUG
assert coll["resources"]
assert coll["dateEarliest"]
assert coll["dateLatest"]
assert coll["crawlCount"] > 0
assert coll["pageCount"] > 0
assert coll["uniquePageCount"] > 0
assert coll["totalSize"] > 0
assert coll["defaultThumbnailName"] == "orange-default.avif"
assert coll["allowPublicDownload"]
for field in NON_PUBLIC_COLL_FIELDS:
assert field not in coll


def test_get_public_collection_unlisted_org_profile_disabled(
admin_auth_headers, default_org_id
):
# Disable org profile
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/public-profile",
headers=admin_auth_headers,
json={
"enablePublicProfile": False,
},
)
assert r.status_code == 200
assert r.json()["updated"]
# Verify we can still get public details for unlisted collection
r = requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{SECOND_PUBLIC_COLL_SLUG}"
)
assert r.status_code == 200
coll = r.json()
assert coll["id"] == _second_public_coll_id
assert coll["oid"] == default_org_id
assert coll["access"] == "unlisted"
assert coll["name"]
assert coll["created"]
assert coll["modified"]
assert coll["slug"]
assert coll["resources"]
assert coll["dateEarliest"]
assert coll["dateLatest"]
assert coll["crawlCount"] > 0
assert coll["pageCount"] > 0
assert coll["uniquePageCount"] > 0
assert coll["totalSize"] > 0
assert coll["defaultThumbnailName"] == "orange-default.avif"
assert coll["allowPublicDownload"]
for field in NON_PUBLIC_COLL_FIELDS:
assert field not in coll
# Re-enable org profile
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/public-profile",
headers=admin_auth_headers,
json={
"enablePublicProfile": True,
},
)
assert r.status_code == 200
assert r.json()["updated"]


def test_delete_thumbnail(crawler_auth_headers, default_org_id):
r = requests.delete(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/thumbnail",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["deleted"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json().get("thumbnail") is None
r = requests.delete(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}/thumbnail",
headers=crawler_auth_headers,
)
assert r.status_code == 404
assert r.json()["detail"] == "thumbnail_not_found"


def test_unset_collection_home_url(
crawler_auth_headers, default_org_id, crawler_crawl_id
):
# Unset home url
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/home-url",
headers=crawler_auth_headers,
json={"pageId": None},
)
assert r.status_code == 200
assert r.json()["updated"]
# Check that home URL fields were unset in the collection as expected
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data.get("homeUrl") is None
assert data.get("homeUrlTs") is None
assert data.get("homeUrlPageId") is None


def test_download_streaming_public_collection(crawler_auth_headers, default_org_id):
# Check that download is blocked if allowPublicDownload is False
with requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}/download",
stream=True,
) as r:
assert r.status_code == 403
# Set allowPublicDownload to True and then check downloading works
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
headers=crawler_auth_headers,
json={
"allowPublicDownload": True,
},
)
assert r.status_code == 200
assert r.json()["updated"]
with TemporaryFile() as fh:
with requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}/download",
stream=True,
) as r:
assert r.status_code == 200
for chunk in r.iter_content():
fh.write(chunk)
fh.seek(0)
with ZipFile(fh, "r") as zip_file:
contents = zip_file.namelist()
assert len(contents) == 2
for filename in contents:
assert filename.endswith(".wacz") or filename == "datapackage.json"
assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_download_streaming_public_collection_profile_disabled(
admin_auth_headers, default_org_id
):
# Disable org public profile and ensure download still works for public collection
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/public-profile",
headers=admin_auth_headers,
json={
"enablePublicProfile": False,
},
)
assert r.status_code == 200
assert r.json()["updated"]
with TemporaryFile() as fh:
with requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}/download",
stream=True,
) as r:
assert r.status_code == 200
for chunk in r.iter_content():
fh.write(chunk)
fh.seek(0)
with ZipFile(fh, "r") as zip_file:
contents = zip_file.namelist()
assert len(contents) == 2
for filename in contents:
assert filename.endswith(".wacz") or filename == "datapackage.json"
assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_get_public_collection_slug_redirect(admin_auth_headers, default_org_id):
# Update public collection slug
new_slug = "new-slug"
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
headers=admin_auth_headers,
json={
"slug": new_slug,
},
)
assert r.status_code == 200
assert r.json()["updated"]
# Get public collection from previous slug
r = requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}"
)
assert r.status_code == 200
coll = r.json()
assert coll["id"] == _public_coll_id
assert coll["oid"] == default_org_id
assert coll["slug"] == new_slug
# Rename second public collection slug to now-unused PUBLIC_COLLECTION_SLUG
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
headers=admin_auth_headers,
json={
"slug": PUBLIC_COLLECTION_SLUG,
},
)
assert r.status_code == 200
assert r.json()["updated"]
# Delete second public collection
r = requests.delete(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
assert r.json()["success"]
# Verify that trying to go to PUBLIC_COLLECTION_SLUG now 404s instead of taking
# us to the collection that had the slug before it was reassigned
r = requests.get(
f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}"
)
assert r.status_code == 404


def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_id):
# Delete second collection
r = requests.delete(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["success"]
# Verify collection id was removed from crawl
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _second_coll_id not in r.json()["collectionIds"]
# Make a new empty (no crawls) collection and delete it
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=crawler_auth_headers,
json={
"name": "To delete",
"description": "Deleting a collection with no crawls should work.",
},
)
assert r.status_code == 200
data = r.json()
assert data["added"]
coll_id = data["id"]
r = requests.delete(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{coll_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["success"]