Fixes #2360

- Adds `initialPages` to the /replay.json response for collections, returning up to 25 pages (seed pages first, then sorted by capture time).
- Adds `pagesQueryUrl` to /replay.json.
- Adds a public pages search endpoint to support public collections.
- Adds `preloadResources`, including the list of WACZ files that should always be loaded, to /replay.json.

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
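For reference, a minimal sketch of how a client might consume the new fields (assuming a running backend, an existing collection, and valid auth headers; `API_PREFIX`, `org_id`, `coll_id`, and `auth_headers` are placeholder names, not part of this change):

```python
import requests

# Fetch collection replay metadata; the response now also carries
# initialPages, pagesQueryUrl, and preloadResources.
data = requests.get(
    f"{API_PREFIX}/orgs/{org_id}/collections/{coll_id}/replay.json",
    headers=auth_headers,
).json()

initial_pages = data["initialPages"]      # up to 25 pages, seed pages first
pages_query_url = data["pagesQueryUrl"]   # endpoint for further page queries
preload = data["preloadResources"]        # WACZ files that should always be loaded

# Query additional pages via the returned URL, e.g. with a text search;
# for public collections this points at the new public pages endpoint.
pages = requests.get(
    pages_query_url, params={"search": "example"}, headers=auth_headers
).json()["items"]
```

The test suite below exercises these fields and endpoints directly.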
import requests
import os
from uuid import uuid4

from zipfile import ZipFile, ZIP_STORED
from tempfile import TemporaryFile

from .conftest import API_PREFIX, NON_DEFAULT_ORG_NAME, NON_DEFAULT_ORG_SLUG
from .utils import read_in_chunks

COLLECTION_NAME = "Test collection"
COLLECTION_SLUG = "test-collection"
PUBLIC_COLLECTION_NAME = "Public Test collection"
PUBLIC_COLLECTION_SLUG = "custom-public-collection-slug"
UPDATED_NAME = "Updated tést cöllection"
UPDATED_SLUG = "updated-test-collection"
SECOND_COLLECTION_NAME = "second-collection"
DESCRIPTION = "Test description"
CAPTION = "Short caption"
UPDATED_CAPTION = "Updated caption"
SECOND_PUBLIC_COLL_SLUG = "second-public-collection"

NON_PUBLIC_COLL_FIELDS = (
    "tags",
    "homeUrlPageId",
)
NON_PUBLIC_IMAGE_FIELDS = ("originalFilename", "userid", "userName", "created")


_coll_id = None
_second_coll_id = None
_public_coll_id = None
_second_public_coll_id = None
upload_id = None
modified = None
default_org_slug = None

curr_dir = os.path.dirname(os.path.realpath(__file__))


def test_create_collection(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    default_thumbnail_name = "default-thumbnail.jpg"

    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "crawlIds": [crawler_crawl_id],
            "name": COLLECTION_NAME,
            "caption": CAPTION,
            "defaultThumbnailName": default_thumbnail_name,
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["added"]
    assert data["name"] == COLLECTION_NAME

    global _coll_id
    _coll_id = data["id"]

    # Verify crawl in collection
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert _coll_id in r.json()["collectionIds"]
    assert r.json()["collections"] == [{"name": COLLECTION_NAME, "id": _coll_id}]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["id"] == _coll_id
    assert data["name"] == COLLECTION_NAME
    assert data["slug"] == COLLECTION_SLUG
    assert data["caption"] == CAPTION
    assert data["crawlCount"] == 1
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    modified = data["modified"]
    assert modified
    assert modified.endswith("Z")

    assert data["dateEarliest"]
    assert data["dateLatest"]

    assert data["defaultThumbnailName"] == default_thumbnail_name
    assert data["allowPublicDownload"]


def test_create_public_collection(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "crawlIds": [crawler_crawl_id],
            "name": PUBLIC_COLLECTION_NAME,
            "slug": PUBLIC_COLLECTION_SLUG,
            "caption": CAPTION,
            "access": "public",
            "allowPublicDownload": False,
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["added"]
    assert data["name"] == PUBLIC_COLLECTION_NAME

    global _public_coll_id
    _public_coll_id = data["id"]

    # Verify that it is public
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.json()["access"] == "public"


def test_create_collection_taken_name(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "crawlIds": [crawler_crawl_id],
            "name": COLLECTION_NAME,
        },
    )
    assert r.status_code == 400
    assert r.json()["detail"] in ("collection_name_taken", "collection_slug_taken")


def test_create_collection_empty_name(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "crawlIds": [crawler_crawl_id],
            "name": "",
        },
    )
    assert r.status_code == 422


def test_update_collection(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
        json={
            "description": DESCRIPTION,
            "caption": UPDATED_CAPTION,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["id"] == _coll_id
    assert data["name"] == COLLECTION_NAME
    assert data["description"] == DESCRIPTION
    assert data["caption"] == UPDATED_CAPTION
    assert data["crawlCount"] == 1
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    global modified
    modified = data["modified"]
    assert modified
    assert modified.endswith("Z")
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["defaultThumbnailName"]


def test_rename_collection(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
        json={
            "name": UPDATED_NAME,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["id"] == _coll_id
    assert data["name"] == UPDATED_NAME
    assert data["slug"] == UPDATED_SLUG
    assert data["modified"] >= modified


def test_rename_collection_taken_name(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    # Add second collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "crawlIds": [crawler_crawl_id],
            "name": SECOND_COLLECTION_NAME,
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["added"]
    assert data["name"] == SECOND_COLLECTION_NAME

    global _second_coll_id
    _second_coll_id = data["id"]

    # Try to rename first coll to second collection's name
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
        json={"name": SECOND_COLLECTION_NAME},
    )
    assert r.status_code == 400
    assert r.json()["detail"] in ("collection_name_taken", "collection_slug_taken")

    # Try to set first coll's slug to value already used for second collection
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
        json={"slug": SECOND_COLLECTION_NAME},
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "collection_slug_taken"


def test_add_remove_crawl_from_collection(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    # Add crawl
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
        json={"crawlIds": [admin_crawl_id]},
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == _coll_id
    assert data["crawlCount"] == 2
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    assert data["modified"] >= modified
    assert data["tags"] == ["wr-test-2", "wr-test-1"]
    assert data["dateEarliest"]
    assert data["dateLatest"]

    # Verify it was added
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert _coll_id in r.json()["collectionIds"]

    # Remove crawls
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/remove",
        json={"crawlIds": [admin_crawl_id, crawler_crawl_id]},
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == _coll_id
    assert data["crawlCount"] == 0
    assert data["pageCount"] == 0
    assert data["uniquePageCount"] == 0
    assert data["totalSize"] == 0
    assert data["modified"] >= modified
    assert data.get("tags", []) == []
    assert data.get("dateEarliest") is None
    assert data.get("dateLatest") is None

    # Verify they were removed
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert _coll_id not in r.json()["collectionIds"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert _coll_id not in r.json()["collectionIds"]

    # Add crawls back for further tests
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
        json={"crawlIds": [admin_crawl_id, crawler_crawl_id]},
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == _coll_id
    assert data["crawlCount"] == 2
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    assert data["modified"] >= modified
    assert data["tags"] == ["wr-test-2", "wr-test-1"]
    assert data["dateEarliest"]
    assert data["dateLatest"]


def test_get_collection(crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == _coll_id
    assert data["name"] == UPDATED_NAME
    assert data["slug"] == UPDATED_SLUG
    assert data["oid"] == default_org_id
    assert data["description"] == DESCRIPTION
    assert data["caption"] == UPDATED_CAPTION
    assert data["crawlCount"] == 2
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    assert data["modified"] >= modified
    assert data["tags"] == ["wr-test-2", "wr-test-1"]
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["defaultThumbnailName"]


def test_get_collection_replay(crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == _coll_id
    assert data["name"] == UPDATED_NAME
    assert data["slug"] == UPDATED_SLUG
    assert data["oid"] == default_org_id
    assert data["description"] == DESCRIPTION
    assert data["caption"] == UPDATED_CAPTION
    assert data["crawlCount"] == 2
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    assert data["modified"] >= modified
    assert data["tags"] == ["wr-test-2", "wr-test-1"]
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["defaultThumbnailName"]
    assert data["initialPages"]
    assert data["pagesQueryUrl"].endswith(
        f"/orgs/{default_org_id}/collections/{_coll_id}/pages"
    )
    assert "preloadResources" in data

    resources = data["resources"]
    assert resources
    for resource in resources:
        assert resource["name"]
        assert resource["path"]
        assert resource["size"]


def test_collection_public(crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 404

    # make public and test replay headers
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
        json={
            "access": "public",
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert data["initialPages"]
    assert data["pagesQueryUrl"].endswith(
        f"/orgs/{default_org_id}/collections/{_coll_id}/public/pages"
    )
    assert "preloadResources" in data

    assert r.status_code == 200
    assert r.headers["Access-Control-Allow-Origin"] == "*"
    assert r.headers["Access-Control-Allow-Headers"] == "*"

    # test public pages endpoint
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] > 0
    assert data["items"]

    # make unlisted and test replay headers
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
        json={
            "access": "unlisted",
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.headers["Access-Control-Allow-Origin"] == "*"
    assert r.headers["Access-Control-Allow-Headers"] == "*"

    # make private again
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
        json={
            "access": "private",
        },
    )

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 404

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 404


def test_collection_access_invalid_value(crawler_auth_headers, default_org_id):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
        json={
            "access": "invalid",
        },
    )
    assert r.status_code == 422

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["access"] == "private"


def test_add_upload_to_collection(crawler_auth_headers, default_org_id):
    with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
        r = requests.put(
            f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test-upload.wacz",
            headers=crawler_auth_headers,
            data=read_in_chunks(fh),
        )

        assert r.status_code == 200
        assert r.json()["added"]

        global upload_id
        upload_id = r.json()["id"]

    # Add upload
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
        json={"crawlIds": [upload_id]},
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == _coll_id
    assert data["crawlCount"] == 3
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    assert data["modified"]
    assert data["tags"] == ["wr-test-2", "wr-test-1"]
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["defaultThumbnailName"]

    # Verify it was added
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert _coll_id in r.json()["collectionIds"]
    assert r.json()["collections"] == [{"name": UPDATED_NAME, "id": _coll_id}]


def test_download_streaming_collection(crawler_auth_headers, default_org_id):
    # Add upload
    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/download",
            headers=crawler_auth_headers,
            stream=True,
        ) as r:
            assert r.status_code == 200
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) == 4
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_list_collections(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections", headers=crawler_auth_headers
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert len(items) == 3

    first_coll = [coll for coll in items if coll["name"] == UPDATED_NAME][0]
    assert first_coll["id"] == _coll_id
    assert first_coll["name"] == UPDATED_NAME
    assert first_coll["slug"] == UPDATED_SLUG
    assert first_coll["oid"] == default_org_id
    assert first_coll["description"] == DESCRIPTION
    assert first_coll["caption"] == UPDATED_CAPTION
    assert first_coll["crawlCount"] == 3
    assert first_coll["pageCount"] > 0
    assert first_coll["uniquePageCount"] > 0
    assert first_coll["totalSize"] > 0
    assert first_coll["modified"]
    assert first_coll["tags"] == ["wr-test-2", "wr-test-1"]
    assert first_coll["access"] == "private"
    assert first_coll["dateEarliest"]
    assert first_coll["dateLatest"]
    assert first_coll["defaultThumbnailName"]

    second_coll = [coll for coll in items if coll["name"] == SECOND_COLLECTION_NAME][0]
    assert second_coll["id"]
    assert second_coll["name"] == SECOND_COLLECTION_NAME
    assert second_coll["slug"] == SECOND_COLLECTION_NAME
    assert second_coll["oid"] == default_org_id
    assert second_coll.get("description") is None
    assert second_coll["crawlCount"] == 1
    assert second_coll["pageCount"] > 0
    assert second_coll["uniquePageCount"] > 0
    assert second_coll["totalSize"] > 0
    assert second_coll["modified"]
    assert second_coll["tags"] == ["wr-test-2"]
    assert second_coll["access"] == "private"
    assert second_coll["dateEarliest"]
    assert second_coll["dateLatest"]


def test_list_pages_in_collection(crawler_auth_headers, default_org_id):
    # Test list endpoint
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["total"] >= 0

    pages = data["items"]
    assert pages

    for page in pages:
        assert page["id"]
        assert page["oid"]
        assert page["crawl_id"]
        assert page["url"]
        assert page["ts"]
        assert page.get("title") or page.get("title") is None
        assert page.get("loadState") or page.get("loadState") is None
        assert page.get("status") or page.get("status") is None
        assert page.get("mime") or page.get("mime") is None
        assert page["isError"] in (None, True, False)
        assert page["isFile"] in (None, True, False)

    # Save info for page to test url and urlPrefix filters
    coll_page = pages[0]
    coll_page_id = coll_page["id"]
    coll_page_url = coll_page["url"]
    coll_page_ts = coll_page["ts"]
    coll_page_title = coll_page["title"]

    # Test search filter
    partial_title = coll_page_title[:5]
    partial_url = coll_page_url[:8]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_title}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["total"] >= 1
    for matching_page in data["items"]:
        assert (
            partial_title in matching_page["title"]
            or partial_url in matching_page["url"]
        )

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_url}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["total"] >= 1
    for matching_page in data["items"]:
        assert (
            partial_title in matching_page["title"]
            or partial_url in matching_page["url"]
        )

    # Test exact url filter
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?url={coll_page_url}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["total"] >= 1
    for matching_page in data["items"]:
        assert matching_page["url"] == coll_page_url

    # Test exact url and ts filters together
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?url={coll_page_url}&ts={coll_page_ts}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["total"] >= 1
    for matching_page in data["items"]:
        assert matching_page["url"] == coll_page_url
        assert matching_page["ts"] == coll_page_ts

    # Test urlPrefix filter
    url_prefix = coll_page_url[:8]
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?urlPrefix={url_prefix}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["total"] >= 1

    found_matching_page = False
    for page in data["items"]:
        if page["id"] == coll_page_id and page["url"] == coll_page_url:
            found_matching_page = True

    assert found_matching_page

    # Test isSeed filter
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?isSeed=true",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    for page in data["items"]:
        assert page["isSeed"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?isSeed=false",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    for page in data["items"]:
        assert page["isSeed"] is False

    # Test depth filter
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?depth=0",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    for page in data["items"]:
        assert page["depth"] == 0

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?depth=1",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    for page in data["items"]:
        assert page["depth"] == 1


def test_remove_upload_from_collection(crawler_auth_headers, default_org_id):
    # Remove upload
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/remove",
        json={"crawlIds": [upload_id]},
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == _coll_id
    assert data["crawlCount"] == 2
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    assert data["modified"] >= modified
    assert data.get("tags") == ["wr-test-2", "wr-test-1"]

    # Verify it was removed
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert _coll_id not in r.json()["collectionIds"]


def test_filter_sort_collections(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    # Test filtering by name
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?name={SECOND_COLLECTION_NAME}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 1

    items = data["items"]
    assert len(items) == 1

    coll = items[0]
    assert coll["id"]
    assert coll["name"] == SECOND_COLLECTION_NAME
    assert coll["oid"] == default_org_id
    assert coll.get("description") is None

    # Test filtering by name prefix
    name_prefix = SECOND_COLLECTION_NAME[0:4]
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?namePrefix={name_prefix}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 1

    items = data["items"]
    assert len(items) == 1

    coll = items[0]
    assert coll["id"]
    assert coll["name"] == SECOND_COLLECTION_NAME
    assert coll["oid"] == default_org_id
    assert coll.get("description") is None

    # Test filtering by name prefix (case insensitive)
    name_prefix = name_prefix.upper()
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?namePrefix={name_prefix}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 1

    items = data["items"]
    assert len(items) == 1

    coll = items[0]
    assert coll["id"]
    assert coll["name"] == SECOND_COLLECTION_NAME
    assert coll["oid"] == default_org_id
    assert coll.get("description") is None

    # Test filtering by access
    name_prefix = name_prefix.upper()
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?access=public",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 1

    items = data["items"]
    assert len(items) == 1

    coll = items[0]
    assert coll["id"]
    assert coll["name"] == PUBLIC_COLLECTION_NAME
    assert coll["oid"] == default_org_id
    assert coll.get("description") is None
    assert coll["access"] == "public"

    # Test sorting by name, ascending (default)
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=name",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert items[0]["name"] == PUBLIC_COLLECTION_NAME
    assert items[1]["name"] == SECOND_COLLECTION_NAME
    assert items[2]["name"] == UPDATED_NAME

    # Test sorting by name, descending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=name&sortDirection=-1",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert items[0]["name"] == UPDATED_NAME
    assert items[1]["name"] == SECOND_COLLECTION_NAME
    assert items[2]["name"] == PUBLIC_COLLECTION_NAME

    # Test sorting by description, ascending (default)
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=description",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert (
        items[0]["name"] == SECOND_COLLECTION_NAME
        or items[0]["name"] == PUBLIC_COLLECTION_NAME
    )
    assert items[0].get("description") is None
    assert (
        items[1]["name"] == PUBLIC_COLLECTION_NAME
        or items[1]["name"] == SECOND_COLLECTION_NAME
    )
    assert items[1]["name"] != items[0]["name"]
    assert items[1].get("description") is None
    assert items[2]["name"] == UPDATED_NAME
    assert items[2]["description"] == DESCRIPTION

    # Test sorting by description, descending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=description&sortDirection=-1",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert items[0]["name"] == UPDATED_NAME
    assert items[0]["description"] == DESCRIPTION
    assert (
        items[1]["name"] == SECOND_COLLECTION_NAME
        or items[1]["name"] == PUBLIC_COLLECTION_NAME
    )
    assert items[1].get("description") is None
    assert (
        items[2]["name"] == PUBLIC_COLLECTION_NAME
        or items[2]["name"] == SECOND_COLLECTION_NAME
    )
    assert items[1]["name"] != items[2]["name"]
    assert items[2].get("description") is None

    # Test sorting by modified, ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=modified",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert items[0]["modified"] <= items[1]["modified"]

    # Test sorting by modified, descending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=modified&sortDirection=-1",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert items[0]["modified"] >= items[1]["modified"]

    # Test sorting by size, ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=totalSize",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert items[0]["totalSize"] <= items[1]["totalSize"]

    # Test sorting by size, descending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=totalSize&sortDirection=-1",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert items[0]["totalSize"] >= items[1]["totalSize"]

    # Invalid sort value
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=invalid",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_sort_by"

    # Invalid sort_direction value
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=modified&sortDirection=0",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_sort_direction"


def test_list_public_collections(
    crawler_auth_headers,
    admin_auth_headers,
    default_org_id,
    non_default_org_id,
    crawler_crawl_id,
    admin_crawl_id,
):
    # Create new public collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "crawlIds": [crawler_crawl_id],
            "name": "Second public collection",
            "slug": SECOND_PUBLIC_COLL_SLUG,
            "description": "Lorem ipsum",
            "access": "public",
        },
    )
    assert r.status_code == 200

    global _second_public_coll_id
    _second_public_coll_id = r.json()["id"]

    # Get default org slug
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    global default_org_slug
    default_org_slug = data["slug"]

    org_name = data["name"]

    # Verify that public profile isn't enabled
    assert data["enablePublicProfile"] is False
    assert data["publicDescription"] == ""
    assert data["publicUrl"] == ""

    # Try listing public collections without org public profile enabled
    r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections")
    assert r.status_code == 404
    assert r.json()["detail"] == "public_profile_not_found"

    # Enable public profile on org
    public_description = "This is a test public org!"
    public_url = "https://example.com"

    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/public-profile",
        headers=admin_auth_headers,
        json={
            "enablePublicProfile": True,
            "publicDescription": public_description,
            "publicUrl": public_url,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["enablePublicProfile"]
    assert data["publicDescription"] == public_description
    assert data["publicUrl"] == public_url

    # List public collections with no auth (no public profile)
    r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections")
    assert r.status_code == 200
    data = r.json()

    org_data = data["org"]
    assert org_data["name"] == org_name
    assert org_data["description"] == public_description
    assert org_data["url"] == public_url

    collections = data["collections"]
    assert len(collections) == 2
    for collection in collections:
        assert collection["id"] in (_public_coll_id, _second_public_coll_id)
        assert collection["oid"]
        assert collection["access"] == "public"
        assert collection["name"]
        assert collection["created"]
        assert collection["modified"]
        assert collection["slug"]
        assert collection["dateEarliest"]
        assert collection["dateLatest"]
        assert collection["crawlCount"] > 0
        assert collection["pageCount"] > 0
        assert collection["uniquePageCount"] > 0
        assert collection["totalSize"] > 0

    # Test non-existing slug - it should return a 404 but not reveal
    # whether or not an org exists with that slug
    r = requests.get(f"{API_PREFIX}/public/orgs/nonexistentslug/collections")
    assert r.status_code == 404
    assert r.json()["detail"] == "public_profile_not_found"


def test_list_public_collections_no_colls(non_default_org_id, admin_auth_headers):
    # Test existing org that's not public - should return same 404 as
    # if org doesn't exist
    r = requests.get(f"{API_PREFIX}/public/orgs/{NON_DEFAULT_ORG_SLUG}/collections")
    assert r.status_code == 404
    assert r.json()["detail"] == "public_profile_not_found"

    # Enable public profile on org with zero public collections
    r = requests.post(
        f"{API_PREFIX}/orgs/{non_default_org_id}/public-profile",
        headers=admin_auth_headers,
        json={
            "enablePublicProfile": True,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    # List public collections with no auth - should still get profile even
    # with no public collections
    r = requests.get(f"{API_PREFIX}/public/orgs/{NON_DEFAULT_ORG_SLUG}/collections")
    assert r.status_code == 200
    data = r.json()
    assert data["org"]["name"] == NON_DEFAULT_ORG_NAME
    assert data["collections"] == []


def test_set_collection_home_url(
    crawler_auth_headers, default_org_id, crawler_crawl_id
):
    # Get a page id from crawler_crawl_id
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] >= 1

    page = data["items"][0]
    assert page

    page_id = page["id"]
    assert page_id

    page_url = page["url"]
    page_ts = page["ts"]

    # Set page as home url
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/home-url",
        headers=crawler_auth_headers,
        json={"pageId": page_id},
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    # Check that fields were set in collection as expected
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["homeUrl"] == page_url
    assert data["homeUrlTs"] == page_ts
    assert data["homeUrlPageId"] == page_id


def test_collection_url_list(crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/urls",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["total"] >= 1
    urls = data["items"]
    assert urls

    for url in urls:
        assert url["url"]
        assert url["count"] >= 1

        snapshots = url["snapshots"]
        assert snapshots

        for snapshot in snapshots:
            assert snapshot["pageId"]
            assert snapshot["ts"]
            assert snapshot["status"]


def test_upload_collection_thumbnail(crawler_auth_headers, default_org_id):
    # https://dev.browsertrix.com/api/orgs/c69247f4-415e-4abc-b449-e85d2f26c626/collections/b764fbe1-baab-4dc5-8dca-2db6f82c250b/thumbnail?filename=page-thumbnail_47fe599e-ed62-4edd-b078-93d4bf281e0f.jpeg&sourceUrl=https%3A%2F%2Fspecs.webrecorder.net%2F&sourceTs=2024-08-16T08%3A00%3A21.601000Z&sourcePageId=47fe599e-ed62-4edd-b078-93d4bf281e0f
    with open(os.path.join(curr_dir, "data", "thumbnail.jpg"), "rb") as fh:
        r = requests.put(
            f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/thumbnail?filename=thumbnail.jpg&sourceUrl=https%3A%2F%2Fexample.com%2F&sourceTs=2024-08-16T08%3A00%3A21.601000Z&sourcePageId=1bba4aba-d5be-4943-ad48-d6710633d754",
            headers=crawler_auth_headers,
            data=read_in_chunks(fh),
        )
        assert r.status_code == 200
        assert r.json()["added"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    collection = r.json()
    thumbnail = collection["thumbnail"]

    assert thumbnail["name"]
    assert thumbnail["path"]
    assert thumbnail["hash"]
    assert thumbnail["size"] > 0

    assert thumbnail["originalFilename"] == "thumbnail.jpg"
    assert thumbnail["mime"] == "image/jpeg"
    assert thumbnail["userid"]
    assert thumbnail["userName"]
    assert thumbnail["created"]

    thumbnailSource = collection["thumbnailSource"]

    assert thumbnailSource["url"]
    assert thumbnailSource["urlTs"]
    assert thumbnailSource["urlPageId"]

    assert thumbnailSource["url"] == "https://example.com/"
    assert thumbnailSource["urlTs"] == "2024-08-16T08:00:21.601000Z"
    assert thumbnailSource["urlPageId"] == "1bba4aba-d5be-4943-ad48-d6710633d754"


def test_set_collection_default_thumbnail(crawler_auth_headers, default_org_id):
    default_thumbnail_name = "orange-default.avif"

    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
        headers=crawler_auth_headers,
        json={"defaultThumbnailName": default_thumbnail_name},
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["id"] == _second_public_coll_id
    assert data["defaultThumbnailName"] == default_thumbnail_name


def test_list_public_colls_home_url_thumbnail():
    # Check we get expected data for each public collection
    # and nothing we don't expect
    non_public_fields = (
        "tags",
        "homeUrlPageId",
    )
    non_public_image_fields = ("originalFilename", "userid", "userName", "created")

    r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections")
    assert r.status_code == 200
    collections = r.json()["collections"]
    assert len(collections) == 2

    for coll in collections:
        assert coll["id"] in (_public_coll_id, _second_public_coll_id)
        assert coll["oid"]
        assert coll["access"] == "public"
        assert coll["name"]
        assert coll["created"]
        assert coll["modified"]
        assert coll["resources"]
        assert coll["dateEarliest"]
        assert coll["dateLatest"]
        assert coll["crawlCount"] > 0
        assert coll["pageCount"] > 0
        assert coll["uniquePageCount"] > 0
        assert coll["totalSize"] > 0

        for field in non_public_fields:
            assert field not in coll

        if coll["id"] == _public_coll_id:
            assert coll["allowPublicDownload"] is False

            assert coll["caption"] == CAPTION

            assert coll["homeUrl"]
            assert coll["homeUrlTs"]

            thumbnail = coll["thumbnail"]
            assert thumbnail

            assert thumbnail["name"]
            assert thumbnail["path"]
            assert thumbnail["hash"]
            assert thumbnail["size"]
            assert thumbnail["mime"]

            for field in non_public_image_fields:
                assert field not in thumbnail

        if coll["id"] == _second_public_coll_id:
            assert coll["description"]
            assert coll["defaultThumbnailName"] == "orange-default.avif"
            assert coll["allowPublicDownload"]


def test_get_public_collection(default_org_id):
    r = requests.get(
        f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}"
    )
    assert r.status_code == 200
    coll = r.json()

    assert coll["id"] == _public_coll_id
    assert coll["oid"] == default_org_id
    assert coll["access"] == "public"
    assert coll["name"]
    assert coll["created"]
    assert coll["modified"]
    assert coll["slug"] == PUBLIC_COLLECTION_SLUG
    assert coll["resources"]
    assert coll["dateEarliest"]
    assert coll["dateLatest"]
    assert coll["crawlCount"] > 0
    assert coll["pageCount"] > 0
    assert coll["uniquePageCount"] > 0
    assert coll["totalSize"] > 0

    for field in NON_PUBLIC_COLL_FIELDS:
        assert field not in coll

    assert coll["caption"] == CAPTION

    assert coll["homeUrl"]
    assert coll["homeUrlTs"]

    assert coll["allowPublicDownload"] is False

    thumbnail = coll["thumbnail"]
    assert thumbnail

    assert thumbnail["name"]
    assert thumbnail["path"]
    assert thumbnail["hash"]
    assert thumbnail["size"]
    assert thumbnail["mime"]

    for field in NON_PUBLIC_IMAGE_FIELDS:
        assert field not in thumbnail

    # Invalid org slug - don't reveal whether org exists or not, use
    # same exception as if collection doesn't exist
    r = requests.get(
        f"{API_PREFIX}/public/orgs/doesntexist/collections/{PUBLIC_COLLECTION_SLUG}"
    )
    assert r.status_code == 404
    assert r.json()["detail"] == "collection_not_found"

    # Unused slug
    random_uuid = uuid4()
    r = requests.get(
        f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/someslugnotinuse"
    )
    assert r.status_code == 404
    assert r.json()["detail"] == "collection_not_found"

    # Collection isn't public
    r = requests.get(
        f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{UPDATED_SLUG}"
    )
    assert r.status_code == 404
    assert r.json()["detail"] == "collection_not_found"


def test_get_public_collection_unlisted(crawler_auth_headers, default_org_id):
    # Make second public coll unlisted
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
        headers=crawler_auth_headers,
        json={
            "access": "unlisted",
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    # Verify single public collection GET endpoint works for unlisted collection
    r = requests.get(
        f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{SECOND_PUBLIC_COLL_SLUG}"
    )
    assert r.status_code == 200
    coll = r.json()

    assert coll["id"] == _second_public_coll_id
    assert coll["oid"] == default_org_id
    assert coll["access"] == "unlisted"
    assert coll["name"]
    assert coll["created"]
    assert coll["modified"]
    assert coll["slug"] == SECOND_PUBLIC_COLL_SLUG
    assert coll["resources"]
    assert coll["dateEarliest"]
    assert coll["dateLatest"]
    assert coll["crawlCount"] > 0
    assert coll["pageCount"] > 0
    assert coll["uniquePageCount"] > 0
    assert coll["totalSize"] > 0
    assert coll["defaultThumbnailName"] == "orange-default.avif"
    assert coll["allowPublicDownload"]

    for field in NON_PUBLIC_COLL_FIELDS:
        assert field not in coll


def test_get_public_collection_unlisted_org_profile_disabled(
    admin_auth_headers, default_org_id
):
    # Disable org profile
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/public-profile",
        headers=admin_auth_headers,
        json={
            "enablePublicProfile": False,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    # Verify we can still get public details for unlisted collection
    r = requests.get(
        f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{SECOND_PUBLIC_COLL_SLUG}"
    )
    assert r.status_code == 200
    coll = r.json()

    assert coll["id"] == _second_public_coll_id
    assert coll["oid"] == default_org_id
    assert coll["access"] == "unlisted"
    assert coll["name"]
    assert coll["created"]
    assert coll["modified"]
    assert coll["slug"]
    assert coll["resources"]
    assert coll["dateEarliest"]
    assert coll["dateLatest"]
    assert coll["crawlCount"] > 0
    assert coll["pageCount"] > 0
    assert coll["uniquePageCount"] > 0
    assert coll["totalSize"] > 0
    assert coll["defaultThumbnailName"] == "orange-default.avif"
    assert coll["allowPublicDownload"]

    for field in NON_PUBLIC_COLL_FIELDS:
        assert field not in coll

    # Re-enable org profile
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/public-profile",
        headers=admin_auth_headers,
        json={
            "enablePublicProfile": True,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]


def test_delete_thumbnail(crawler_auth_headers, default_org_id):
    r = requests.delete(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/thumbnail",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["deleted"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json().get("thumbnail") is None

    r = requests.delete(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}/thumbnail",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 404
    assert r.json()["detail"] == "thumbnail_not_found"


def test_unset_collection_home_url(
    crawler_auth_headers, default_org_id, crawler_crawl_id
):
    # Unset home url
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/home-url",
        headers=crawler_auth_headers,
        json={"pageId": None},
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    # Check that fields were set in collection as expected
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data.get("homeUrl") is None
    assert data.get("homeUrlTs") is None
    assert data.get("homeUrlPageId") is None


def test_download_streaming_public_collection(crawler_auth_headers, default_org_id):
    # Check that download is blocked if allowPublicDownload is False
    with requests.get(
        f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}/download",
        stream=True,
    ) as r:
        assert r.status_code == 403

    # Set allowPublicDownload to True and then check downloading works
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
        headers=crawler_auth_headers,
        json={
            "allowPublicDownload": True,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}/download",
            stream=True,
        ) as r:
            assert r.status_code == 200
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) == 2
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_download_streaming_public_collection_profile_disabled(
    admin_auth_headers, default_org_id
):
    # Disable org public profile and ensure download still works for public collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/public-profile",
        headers=admin_auth_headers,
        json={
            "enablePublicProfile": False,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}/download",
            stream=True,
        ) as r:
            assert r.status_code == 200
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) == 2
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_get_public_collection_slug_redirect(admin_auth_headers, default_org_id):
    # Update public collection slug
    new_slug = "new-slug"

    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
        headers=admin_auth_headers,
        json={
            "slug": new_slug,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    # Get public collection from previous slug
    r = requests.get(
        f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}"
    )
    assert r.status_code == 200
    coll = r.json()

    assert coll["id"] == _public_coll_id
    assert coll["oid"] == default_org_id
    assert coll["slug"] == new_slug

    # Rename second public collection slug to now-unused PUBLIC_COLLECTION_SLUG
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
        headers=admin_auth_headers,
        json={
            "slug": PUBLIC_COLLECTION_SLUG,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    # Delete second public collection
    r = requests.delete(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["success"]

    # Verify that trying to go to PUBLIC_COLLECTION_SLUG now 404s instead of taking
    # us to the collection that had the slug before it was reassigned
    r = requests.get(
        f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}"
    )
    assert r.status_code == 404


def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_id):
    # Delete second collection
    r = requests.delete(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["success"]

    # Verify collection id was removed from crawl
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert _second_coll_id not in r.json()["collectionIds"]

    # Make a new empty (no crawls) collection and delete it
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "name": "To delete",
            "description": "Deleting a collection with no crawls should work.",
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["added"]
    coll_id = data["id"]

    r = requests.delete(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["success"]