Fixes #2673

Changes in this PR:

- Adds a new `file_uploads.py` module and corresponding `/files` API prefix with methods/endpoints for uploading, GETting, and deleting seed files (can be extended to other types of files moving forward)
- Seed files are supported via `CrawlConfig.config.seedFileId` on POST and PATCH endpoints. This seedFileId is replaced by a presigned URL when passed to the crawler by the operator
- Seed files are read when first uploaded to calculate `firstSeed` and `seedCount`, which are stored in the database and copied into the workflow and crawl documents when they are created
- Logic is added to store `firstSeed` and `seedCount` for other workflows as well, and a migration is added to backfill data, to maintain consistency and fix some of the pymongo aggregations that previously assumed all workflows would have at least one `Seed` object in `CrawlConfig.seeds`
- Seed file and thumbnail storage stats are added to org stats
- Seed file and thumbnail uploads first check that the org's storage quota has not been exceeded and return a 400 if so
- A cron background job (run weekly each Sunday at midnight by default, but configurable) is added to look for seed files at least x minutes old (1440 minutes, or 1 day, by default, but configurable) that are not in use in any workflows, and to delete them when they are found. The backend pods ensure this k8s batch job exists when starting up and create it if it does not already exist. A database entry for each run of the job is created in the operator on job completion so that it appears in the `/jobs` API endpoints, but retrying this type of regularly scheduled background job is not supported, as we don't want to accidentally create multiple competing scheduled jobs
- Adds a `min_seed_file_crawler_image` value to the Helm chart that is checked before creating a crawl from a workflow, if set. If a workflow cannot be run, the detail of the exception is returned in `CrawlConfigAddedResponse.errorDetail` so that the reason can be displayed in the frontend
- Adds a `SeedFile` model based on `UserFile` (formerly `ImageFile`), and ensures all APIs returning uploaded files return an absolute pre-signed URL (either with the external origin or the internal service origin)

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
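As a rough illustration of the new API surface, here is a minimal sketch in the same `requests` style as the tests below. The exact upload path (`/files/seedFile`), the workflow creation payload shape, and the response fields other than `seedFileId` and `errorDetail` (which are named in the summary above) are assumptions, not confirmed API:

```python
import requests

API_PREFIX = "http://localhost:30870/api"  # assumed local dev origin


def upload_seed_file_and_create_workflow(org_id, auth_headers):
    # Upload a seed list; per the summary, firstSeed and seedCount are
    # computed server-side when the file is first uploaded, and a 400 is
    # returned if the org's storage quota has been exceeded.
    # NOTE: the "/files/seedFile" path is a hypothetical example.
    with open("seeds.txt", "rb") as fh:
        r = requests.put(
            f"{API_PREFIX}/orgs/{org_id}/files/seedFile?filename=seeds.txt",
            headers=auth_headers,
            data=fh,
        )
    r.raise_for_status()
    seed_file_id = r.json()["id"]  # hypothetical response field

    # Reference the uploaded file from a workflow instead of inline seeds;
    # the operator swaps seedFileId for a presigned URL before the crawler
    # sees the config.
    r = requests.post(
        f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/",
        headers=auth_headers,
        json={
            "name": "Seed file workflow",
            "config": {"seedFileId": seed_file_id},
        },
    )
    r.raise_for_status()
    # If the workflow can't be run (e.g. crawler image older than
    # min_seed_file_crawler_image), the reason appears in errorDetail.
    return r.json()
```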
import requests
import os
from uuid import uuid4

from zipfile import ZipFile, ZIP_STORED
from tempfile import TemporaryFile

from .conftest import API_PREFIX, NON_DEFAULT_ORG_NAME, NON_DEFAULT_ORG_SLUG
from .utils import read_in_chunks

COLLECTION_NAME = "Test collection"
COLLECTION_SLUG = "test-collection"
PUBLIC_COLLECTION_NAME = "Public Test collection"
PUBLIC_COLLECTION_SLUG = "custom-public-collection-slug"
UPDATED_NAME = "Updated tést cöllection"
UPDATED_SLUG = "updated-test-collection"
SECOND_COLLECTION_NAME = "second-collection"
DESCRIPTION = "Test description"
CAPTION = "Short caption"
UPDATED_CAPTION = "Updated caption"
SECOND_PUBLIC_COLL_SLUG = "second-public-collection"

NON_PUBLIC_COLL_FIELDS = (
    "tags",
    "homeUrlPageId",
)
NON_PUBLIC_IMAGE_FIELDS = ("originalFilename", "userid", "userName", "created")


_coll_id = None
_second_coll_id = None
_public_coll_id = None
_second_public_coll_id = None
upload_id = None
modified = None
default_org_slug = None

curr_dir = os.path.dirname(os.path.realpath(__file__))


def test_create_collection(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    default_thumbnail_name = "default-thumbnail.jpg"

    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "crawlIds": [crawler_crawl_id],
            "name": COLLECTION_NAME,
            "caption": CAPTION,
            "defaultThumbnailName": default_thumbnail_name,
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["added"]
    assert data["name"] == COLLECTION_NAME

    global _coll_id
    _coll_id = data["id"]

    # Verify crawl in collection
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert _coll_id in r.json()["collectionIds"]
    assert r.json()["collections"] == [{"name": COLLECTION_NAME, "id": _coll_id}]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["id"] == _coll_id
    assert data["name"] == COLLECTION_NAME
    assert data["slug"] == COLLECTION_SLUG
    assert data["caption"] == CAPTION
    assert data["crawlCount"] == 1
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    modified = data["modified"]
    assert modified
    assert modified.endswith("Z")

    assert data["dateEarliest"]
    assert data["dateLatest"]

    assert data["defaultThumbnailName"] == default_thumbnail_name
    assert data["allowPublicDownload"]

    assert data["topPageHosts"] == [{"count": 3, "host": "webrecorder.net"}]


def test_create_public_collection(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "crawlIds": [crawler_crawl_id],
            "name": PUBLIC_COLLECTION_NAME,
            "slug": PUBLIC_COLLECTION_SLUG,
            "caption": CAPTION,
            "access": "public",
            "allowPublicDownload": False,
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["added"]
    assert data["name"] == PUBLIC_COLLECTION_NAME

    global _public_coll_id
    _public_coll_id = data["id"]

    # Verify that it is public
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.json()["access"] == "public"


def test_create_collection_taken_name(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "crawlIds": [crawler_crawl_id],
            "name": COLLECTION_NAME,
        },
    )
    assert r.status_code == 400
    assert r.json()["detail"] in ("collection_name_taken", "collection_slug_taken")


def test_create_collection_empty_name(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "crawlIds": [crawler_crawl_id],
            "name": "",
        },
    )
    assert r.status_code == 422


def test_create_empty_collection(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "name": "Empty Collection",
        },
    )
    assert r.status_code == 200
    coll_id = r.json()["id"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{coll_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["crawlCount"] == 0
    assert data["pageCount"] == 0
    assert len(data["resources"]) == 0

    # Delete the empty collection
    r = requests.delete(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["success"]


def test_update_collection(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
        json={
            "description": DESCRIPTION,
            "caption": UPDATED_CAPTION,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["id"] == _coll_id
    assert data["name"] == COLLECTION_NAME
    assert data["description"] == DESCRIPTION
    assert data["caption"] == UPDATED_CAPTION
    assert data["crawlCount"] == 1
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    global modified
    modified = data["modified"]
    assert modified
    assert modified.endswith("Z")
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["defaultThumbnailName"]
    assert data["topPageHosts"]


def test_rename_collection(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
        json={
            "name": UPDATED_NAME,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["id"] == _coll_id
    assert data["name"] == UPDATED_NAME
    assert data["slug"] == UPDATED_SLUG
    assert data["modified"] >= modified


def test_rename_collection_taken_name(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    # Add second collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "crawlIds": [crawler_crawl_id],
            "name": SECOND_COLLECTION_NAME,
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["added"]
    assert data["name"] == SECOND_COLLECTION_NAME

    global _second_coll_id
    _second_coll_id = data["id"]

    # Try to rename first coll to second collection's name
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
        json={"name": SECOND_COLLECTION_NAME},
    )
    assert r.status_code == 400
    assert r.json()["detail"] in ("collection_name_taken", "collection_slug_taken")

    # Try to set first coll's slug to value already used for second collection
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
        json={"slug": SECOND_COLLECTION_NAME},
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "collection_slug_taken"


def test_add_remove_crawl_from_collection(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    # Add crawl
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
        json={"crawlIds": [admin_crawl_id]},
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == _coll_id
    assert data["crawlCount"] == 2
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    assert data["modified"] >= modified
    assert data["tags"] == ["wr-test-2", "wr-test-1"]
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["topPageHosts"] == [{"count": 7, "host": "webrecorder.net"}]

    # Verify it was added
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert _coll_id in r.json()["collectionIds"]

    # Remove crawls
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/remove",
        json={"crawlIds": [admin_crawl_id, crawler_crawl_id]},
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == _coll_id
    assert data["crawlCount"] == 0
    assert data["pageCount"] == 0
    assert data["uniquePageCount"] == 0
    assert data["totalSize"] == 0
    assert data["modified"] >= modified
    assert data.get("tags", []) == []
    assert data.get("dateEarliest") is None
    assert data.get("dateLatest") is None
    assert data["topPageHosts"] == []

    # Verify they were removed
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert _coll_id not in r.json()["collectionIds"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert _coll_id not in r.json()["collectionIds"]

    # Add crawls back for further tests
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
        json={"crawlIds": [admin_crawl_id, crawler_crawl_id]},
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == _coll_id
    assert data["crawlCount"] == 2
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    assert data["modified"] >= modified
    assert data["tags"] == ["wr-test-2", "wr-test-1"]
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["topPageHosts"]


def test_get_collection(crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == _coll_id
    assert data["name"] == UPDATED_NAME
    assert data["slug"] == UPDATED_SLUG
    assert data["oid"] == default_org_id
    assert data["description"] == DESCRIPTION
    assert data["caption"] == UPDATED_CAPTION
    assert data["crawlCount"] == 2
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    assert data["modified"] >= modified
    assert data["tags"] == ["wr-test-2", "wr-test-1"]
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["defaultThumbnailName"]


def test_get_collection_replay(crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/replay.json",
        headers={"host": "custom-domain.example.com", **crawler_auth_headers},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == _coll_id
    assert data["name"] == UPDATED_NAME
    assert data["slug"] == UPDATED_SLUG
    assert data["oid"] == default_org_id
    assert data["description"] == DESCRIPTION
    assert data["caption"] == UPDATED_CAPTION
    assert data["crawlCount"] == 2
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    assert data["modified"] >= modified
    assert data["tags"] == ["wr-test-2", "wr-test-1"]
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["defaultThumbnailName"]
    assert data["initialPages"]
    assert (
        data["pagesQueryUrl"]
        == f"http://custom-domain.example.com/api/orgs/{default_org_id}/collections/{_coll_id}/pages"
    )
    assert data["downloadUrl"] is None
    assert "preloadResources" in data

    resources = data["resources"]
    assert resources
    for resource in resources:
        assert resource["name"]
        assert resource["path"]
        assert resource["size"]


def test_collection_public(crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 404

    # make public and test replay headers
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
        json={
            "access": "public",
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
        headers={"host": "custom-domain.example.com", **crawler_auth_headers},
    )
    data = r.json()
    assert data["initialPages"]
    assert (
        data["pagesQueryUrl"]
        == f"http://custom-domain.example.com/api/orgs/{default_org_id}/collections/{_coll_id}/public/pages"
    )
    assert data["downloadUrl"] is not None
    assert "preloadResources" in data

    assert r.status_code == 200
    assert r.headers["Access-Control-Allow-Origin"] == "*"
    assert r.headers["Access-Control-Allow-Headers"] == "*"

    # test public pages endpoint
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert len(data["items"]) > 0

    # make unlisted and test replay headers
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
        json={
            "access": "unlisted",
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.headers["Access-Control-Allow-Origin"] == "*"
    assert r.headers["Access-Control-Allow-Headers"] == "*"


def test_collection_wrong_org(admin_auth_headers, non_default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{non_default_org_id}/collections/{_coll_id}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 404

    r = requests.get(
        f"{API_PREFIX}/orgs/{non_default_org_id}/collections/{_coll_id}/public/replay.json",
    )
    assert r.status_code == 404


def test_collection_public_make_private(crawler_auth_headers, default_org_id):
    # make private again
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
        json={
            "access": "private",
        },
    )

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 404

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 404


def test_collection_access_invalid_value(crawler_auth_headers, default_org_id):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
        json={
            "access": "invalid",
        },
    )
    assert r.status_code == 422

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["access"] == "private"


def test_add_upload_to_collection(crawler_auth_headers, default_org_id):
    with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
        r = requests.put(
            f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test-upload.wacz",
            headers=crawler_auth_headers,
            data=read_in_chunks(fh),
        )

    assert r.status_code == 200
    assert r.json()["added"]

    global upload_id
    upload_id = r.json()["id"]

    # Add upload
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
        json={"crawlIds": [upload_id]},
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == _coll_id
    assert data["crawlCount"] == 3
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    assert data["modified"]
    assert data["tags"] == ["wr-test-2", "wr-test-1"]
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["defaultThumbnailName"]

    # Verify it was added
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert _coll_id in r.json()["collectionIds"]
    assert r.json()["collections"] == [{"name": UPDATED_NAME, "id": _coll_id}]


def test_download_streaming_collection(crawler_auth_headers, default_org_id):
    # Stream download of the collection and check the zip contents
    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/download",
            headers=crawler_auth_headers,
            stream=True,
        ) as r:
            assert r.status_code == 200
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) == 4
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_list_collections(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections", headers=crawler_auth_headers
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert len(items) == 3

    first_coll = [coll for coll in items if coll["name"] == UPDATED_NAME][0]
    assert first_coll["id"] == _coll_id
    assert first_coll["name"] == UPDATED_NAME
    assert first_coll["slug"] == UPDATED_SLUG
    assert first_coll["oid"] == default_org_id
    assert first_coll["description"] == DESCRIPTION
    assert first_coll["caption"] == UPDATED_CAPTION
    assert first_coll["crawlCount"] == 3
    assert first_coll["pageCount"] > 0
    assert first_coll["uniquePageCount"] > 0
    assert first_coll["totalSize"] > 0
    assert first_coll["modified"]
    assert first_coll["tags"] == ["wr-test-2", "wr-test-1"]
    assert first_coll["access"] == "private"
    assert first_coll["dateEarliest"]
    assert first_coll["dateLatest"]
    assert first_coll["defaultThumbnailName"]

    second_coll = [coll for coll in items if coll["name"] == SECOND_COLLECTION_NAME][0]
    assert second_coll["id"]
    assert second_coll["name"] == SECOND_COLLECTION_NAME
    assert second_coll["slug"] == SECOND_COLLECTION_NAME
    assert second_coll["oid"] == default_org_id
    assert second_coll.get("description") is None
    assert second_coll["crawlCount"] == 1
    assert second_coll["pageCount"] > 0
    assert second_coll["uniquePageCount"] > 0
    assert second_coll["totalSize"] > 0
    assert second_coll["modified"]
    assert second_coll["tags"] == ["wr-test-2"]
    assert second_coll["access"] == "private"
    assert second_coll["dateEarliest"]
    assert second_coll["dateLatest"]


def test_list_pages_in_collection(crawler_auth_headers, default_org_id):
    # Test list endpoint
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert len(data["items"]) >= 0

    pages = data["items"]
    assert pages

    for page in pages:
        assert page["id"]
        assert page["oid"]
        assert page["crawl_id"]
        assert page["url"]
        assert page["ts"]
        assert page.get("title") or page.get("title") is None
        assert page.get("loadState") or page.get("loadState") is None
        assert page.get("status") or page.get("status") is None
        assert page.get("mime") or page.get("mime") is None
        assert page["isError"] in (None, True, False)
        assert page["isFile"] in (None, True, False)

    # Save info for page to test url and urlPrefix filters
    coll_page = pages[0]
    coll_page_id = coll_page["id"]
    coll_page_url = coll_page["url"]
    coll_page_ts = coll_page["ts"]
    coll_page_title = coll_page["title"]

    # Test search filter, make sure text search isn't case sensitive
    partial_title = "Archiving"
    partial_title_lower = partial_title.lower()
    partial_url = coll_page_url[:8]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_title_lower}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert len(data["items"]) >= 1
    for matching_page in data["items"]:
        assert (
            partial_title in matching_page["title"]
            or partial_url in matching_page["url"]
        )

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_url}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert len(data["items"]) >= 1
    for matching_page in data["items"]:
        assert (
            partial_title in matching_page["title"]
            or partial_url in matching_page["url"]
        )

    # Test exact url filter
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?url={coll_page_url}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert len(data["items"]) >= 1
    for matching_page in data["items"]:
        assert matching_page["url"] == coll_page_url

    # Test exact url and ts filters together
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?url={coll_page_url}&ts={coll_page_ts}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert len(data["items"]) >= 1
    for matching_page in data["items"]:
        assert matching_page["url"] == coll_page_url
        assert matching_page["ts"] == coll_page_ts

    # Test urlPrefix filter
    url_prefix = coll_page_url[:8]
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?urlPrefix={url_prefix}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert len(data["items"]) >= 1

    found_matching_page = False
    for page in data["items"]:
        if page["id"] == coll_page_id and page["url"] == coll_page_url:
            found_matching_page = True

    assert found_matching_page

    # Test isSeed filter
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?isSeed=true",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    for page in data["items"]:
        assert page["isSeed"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?isSeed=false",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    for page in data["items"]:
        assert page["isSeed"] is False

    # Test depth filter
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?depth=0",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    for page in data["items"]:
        assert page["depth"] == 0

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?depth=1",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    for page in data["items"]:
        assert page["depth"] == 1


def test_remove_upload_from_collection(crawler_auth_headers, default_org_id):
    # Remove upload
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/remove",
        json={"crawlIds": [upload_id]},
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == _coll_id
    assert data["crawlCount"] == 2
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    assert data["modified"] >= modified
    assert data.get("tags") == ["wr-test-2", "wr-test-1"]

    # Verify it was removed
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert _coll_id not in r.json()["collectionIds"]


def test_filter_sort_collections(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
):
    # Test filtering by name
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?name={SECOND_COLLECTION_NAME}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 1

    items = data["items"]
    assert len(items) == 1

    coll = items[0]
    assert coll["id"]
    assert coll["name"] == SECOND_COLLECTION_NAME
    assert coll["oid"] == default_org_id
    assert coll.get("description") is None

    # Test filtering by name prefix
    name_prefix = SECOND_COLLECTION_NAME[0:4]
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?namePrefix={name_prefix}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 1

    items = data["items"]
    assert len(items) == 1

    coll = items[0]
    assert coll["id"]
    assert coll["name"] == SECOND_COLLECTION_NAME
    assert coll["oid"] == default_org_id
    assert coll.get("description") is None

    # Test filtering by name prefix (case insensitive)
    name_prefix = name_prefix.upper()
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?namePrefix={name_prefix}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 1

    items = data["items"]
    assert len(items) == 1

    coll = items[0]
    assert coll["id"]
    assert coll["name"] == SECOND_COLLECTION_NAME
    assert coll["oid"] == default_org_id
    assert coll.get("description") is None

    # Test filtering by access
    name_prefix = name_prefix.upper()
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?access=public",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 1

    items = data["items"]
    assert len(items) == 1

    coll = items[0]
    assert coll["id"]
    assert coll["name"] == PUBLIC_COLLECTION_NAME
    assert coll["oid"] == default_org_id
    assert coll.get("description") is None
    assert coll["access"] == "public"

    # Test sorting by name, ascending (default)
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=name",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert items[0]["name"] == PUBLIC_COLLECTION_NAME
    assert items[1]["name"] == SECOND_COLLECTION_NAME
    assert items[2]["name"] == UPDATED_NAME

    # Test sorting by name, descending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=name&sortDirection=-1",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert items[0]["name"] == UPDATED_NAME
    assert items[1]["name"] == SECOND_COLLECTION_NAME
    assert items[2]["name"] == PUBLIC_COLLECTION_NAME

    # Test sorting by description, ascending (default)
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=description",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert (
        items[0]["name"] == SECOND_COLLECTION_NAME
        or items[0]["name"] == PUBLIC_COLLECTION_NAME
    )
    assert items[0].get("description") is None
    assert (
        items[1]["name"] == PUBLIC_COLLECTION_NAME
        or items[1]["name"] == SECOND_COLLECTION_NAME
    )
    assert items[1]["name"] != items[0]["name"]
    assert items[1].get("description") is None
    assert items[2]["name"] == UPDATED_NAME
    assert items[2]["description"] == DESCRIPTION

    # Test sorting by description, descending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=description&sortDirection=-1",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert items[0]["name"] == UPDATED_NAME
    assert items[0]["description"] == DESCRIPTION
    assert (
        items[1]["name"] == SECOND_COLLECTION_NAME
        or items[1]["name"] == PUBLIC_COLLECTION_NAME
    )
    assert items[1].get("description") is None
    assert (
        items[2]["name"] == PUBLIC_COLLECTION_NAME
        or items[2]["name"] == SECOND_COLLECTION_NAME
    )
    assert items[1]["name"] != items[2]["name"]
    assert items[2].get("description") is None

    # Test sorting by modified, ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=modified",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert items[0]["modified"] <= items[1]["modified"]

    # Test sorting by modified, descending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=modified&sortDirection=-1",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert items[0]["modified"] >= items[1]["modified"]

    # Test sorting by size, ascending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=totalSize",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert items[0]["totalSize"] <= items[1]["totalSize"]

    # Test sorting by size, descending
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=totalSize&sortDirection=-1",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    items = data["items"]
    assert items[0]["totalSize"] >= items[1]["totalSize"]

    # Invalid sort value
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=invalid",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_sort_by"

    # Invalid sort_direction value
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=modified&sortDirection=0",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_sort_direction"


def test_list_public_collections(
    crawler_auth_headers,
    admin_auth_headers,
    default_org_id,
    non_default_org_id,
    crawler_crawl_id,
    admin_crawl_id,
):
    # Create new public collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "crawlIds": [crawler_crawl_id],
            "name": "Second public collection",
            "slug": SECOND_PUBLIC_COLL_SLUG,
            "description": "Lorem ipsum",
            "access": "public",
        },
    )
    assert r.status_code == 200

    global _second_public_coll_id
    _second_public_coll_id = r.json()["id"]

    # Get default org slug
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    global default_org_slug
    default_org_slug = data["slug"]

    org_name = data["name"]

    # Verify that public profile isn't enabled
    assert data["enablePublicProfile"] is False
    assert data["publicDescription"] == ""
    assert data["publicUrl"] == ""

    # Try listing public collections without org public profile enabled
    r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections")
    assert r.status_code == 404
    assert r.json()["detail"] == "public_profile_not_found"

    # Enable public profile on org
    public_description = "This is a test public org!"
    public_url = "https://example.com"

    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/public-profile",
        headers=admin_auth_headers,
        json={
            "enablePublicProfile": True,
            "publicDescription": public_description,
            "publicUrl": public_url,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["enablePublicProfile"]
    assert data["publicDescription"] == public_description
    assert data["publicUrl"] == public_url

    # List public collections with no auth (no public profile)
    r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections")
    assert r.status_code == 200
    data = r.json()

    org_data = data["org"]
    assert org_data["name"] == org_name
    assert org_data["description"] == public_description
    assert org_data["url"] == public_url

    collections = data["collections"]
    assert len(collections) == 2
    for collection in collections:
        assert collection["id"] in (_public_coll_id, _second_public_coll_id)
        assert collection["oid"]
        assert collection["access"] == "public"
        assert collection["name"]
        assert collection["created"]
        assert collection["modified"]
        assert collection["slug"]
        assert collection["dateEarliest"]
        assert collection["dateLatest"]
        assert collection["crawlCount"] > 0
        assert collection["pageCount"] > 0
        assert collection["uniquePageCount"] > 0
        assert collection["totalSize"] > 0
        assert collection["topPageHosts"]

    # Test non-existing slug - it should return a 404 but not reveal
    # whether or not an org exists with that slug
    r = requests.get(f"{API_PREFIX}/public/orgs/nonexistentslug/collections")
    assert r.status_code == 404
    assert r.json()["detail"] == "public_profile_not_found"


def test_list_public_collections_no_colls(non_default_org_id, admin_auth_headers):
    # Test existing org that's not public - should return same 404 as
    # if org doesn't exist
    r = requests.get(f"{API_PREFIX}/public/orgs/{NON_DEFAULT_ORG_SLUG}/collections")
    assert r.status_code == 404
    assert r.json()["detail"] == "public_profile_not_found"

    # Enable public profile on org with zero public collections
    r = requests.post(
        f"{API_PREFIX}/orgs/{non_default_org_id}/public-profile",
        headers=admin_auth_headers,
        json={
            "enablePublicProfile": True,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    # List public collections with no auth - should still get profile even
    # with no public collections
    r = requests.get(f"{API_PREFIX}/public/orgs/{NON_DEFAULT_ORG_SLUG}/collections")
    assert r.status_code == 200
    data = r.json()
    assert data["org"]["name"] == NON_DEFAULT_ORG_NAME
    assert data["collections"] == []


def test_set_collection_home_url(
    crawler_auth_headers, default_org_id, crawler_crawl_id
):
    # Get a page id from crawler_crawl_id
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] >= 1

    page = data["items"][0]
    assert page

    page_id = page["id"]
    assert page_id

    page_url = page["url"]
    page_ts = page["ts"]

    # Set page as home url
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/home-url",
        headers=crawler_auth_headers,
        json={"pageId": page_id},
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    # Check that fields were set in collection as expected
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["homeUrl"] == page_url
    assert data["homeUrlTs"] == page_ts
    assert data["homeUrlPageId"] == page_id


def test_collection_url_list(crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/pageUrlCounts",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert len(data["items"]) >= 1
    urls = data["items"]
    assert urls

    for url in urls:
        assert url["url"]
        assert url["count"] >= 1

        snapshots = url["snapshots"]
        assert snapshots

        for snapshot in snapshots:
            assert snapshot["pageId"]
            assert snapshot["ts"]
            assert snapshot["status"]


def test_upload_collection_thumbnail(crawler_auth_headers, default_org_id):
    # https://dev.browsertrix.com/api/orgs/c69247f4-415e-4abc-b449-e85d2f26c626/collections/b764fbe1-baab-4dc5-8dca-2db6f82c250b/thumbnail?filename=page-thumbnail_47fe599e-ed62-4edd-b078-93d4bf281e0f.jpeg&sourceUrl=https%3A%2F%2Fspecs.webrecorder.net%2F&sourceTs=2024-08-16T08%3A00%3A21.601000Z&sourcePageId=47fe599e-ed62-4edd-b078-93d4bf281e0f
    with open(os.path.join(curr_dir, "data", "thumbnail.jpg"), "rb") as fh:
        r = requests.put(
            f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/thumbnail?filename=thumbnail.jpg&sourceUrl=https%3A%2F%2Fexample.com%2F&sourceTs=2024-08-16T08%3A00%3A21.601000Z&sourcePageId=1bba4aba-d5be-4943-ad48-d6710633d754",
            headers=crawler_auth_headers,
            data=read_in_chunks(fh),
        )
        assert r.status_code == 200
        assert r.json()["added"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
        headers={"Host": "localhost:30870", **crawler_auth_headers},
    )
    assert r.status_code == 200
    collection = r.json()
    thumbnail = collection["thumbnail"]

    assert thumbnail["name"]
    assert thumbnail["path"].startswith("http://localhost:30870/data/")
    assert thumbnail["hash"]
    assert thumbnail["size"] > 0

    assert thumbnail["originalFilename"] == "thumbnail.jpg"
    assert thumbnail["mime"] == "image/jpeg"
    assert thumbnail["userid"]
    assert thumbnail["userName"]
    assert thumbnail["created"]

    thumbnailSource = collection["thumbnailSource"]

    assert thumbnailSource["url"]
    assert thumbnailSource["urlTs"]
    assert thumbnailSource["urlPageId"]

    assert thumbnailSource["url"] == "https://example.com/"
    assert thumbnailSource["urlTs"] == "2024-08-16T08:00:21.601000Z"
    assert thumbnailSource["urlPageId"] == "1bba4aba-d5be-4943-ad48-d6710633d754"


def test_set_collection_default_thumbnail(crawler_auth_headers, default_org_id):
    default_thumbnail_name = "orange-default.avif"

    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
        headers=crawler_auth_headers,
        json={"defaultThumbnailName": default_thumbnail_name},
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["id"] == _second_public_coll_id
    assert data["defaultThumbnailName"] == default_thumbnail_name


def test_list_public_colls_home_url_thumbnail():
    # Check we get expected data for each public collection
    # and nothing we don't expect
    non_public_fields = (
        "tags",
        "homeUrlPageId",
    )
    non_public_image_fields = ("originalFilename", "userid", "userName", "created")

    r = requests.get(
        f"{API_PREFIX}/public/orgs/{default_org_slug}/collections",
        headers={"Host": "localhost:30870"},
    )

    assert r.status_code == 200
    collections = r.json()["collections"]
    assert len(collections) == 2

    for coll in collections:
        assert coll["id"] in (_public_coll_id, _second_public_coll_id)
        assert coll["oid"]
        assert coll["access"] == "public"
        assert coll["name"]
        assert coll["created"]
        assert coll["modified"]
        assert coll["dateEarliest"]
        assert coll["dateLatest"]
        assert coll["crawlCount"] > 0
        assert coll["pageCount"] > 0
        assert coll["uniquePageCount"] > 0
        assert coll["totalSize"] > 0

        for field in non_public_fields:
            assert field not in coll

        if coll["id"] == _public_coll_id:
            assert coll["allowPublicDownload"] is False

            assert coll["caption"] == CAPTION

            assert coll["homeUrl"]
            assert coll["homeUrlTs"]

            thumbnail = coll["thumbnail"]
            assert thumbnail

            assert thumbnail["name"]
            assert thumbnail["path"].startswith("http://localhost:30870/data/")
            assert thumbnail["hash"]
            assert thumbnail["size"]
            assert thumbnail["mime"]

            for field in non_public_image_fields:
                assert field not in thumbnail

        if coll["id"] == _second_public_coll_id:
            assert coll["description"]
            assert coll["defaultThumbnailName"] == "orange-default.avif"
            assert coll["allowPublicDownload"]


def test_get_public_collection(default_org_id):
    r = requests.get(
        f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}",
        headers={"Host": "localhost:30870"},
    )
    assert r.status_code == 200
    coll = r.json()

    assert coll["id"] == _public_coll_id
    assert coll["oid"] == default_org_id
    assert coll["access"] == "public"
    assert coll["name"]
    assert coll["created"]
    assert coll["modified"]
    assert coll["slug"] == PUBLIC_COLLECTION_SLUG
    assert coll["resources"]
    assert coll["dateEarliest"]
    assert coll["dateLatest"]
    assert coll["crawlCount"] > 0
    assert coll["pageCount"] > 0
    assert coll["uniquePageCount"] > 0
    assert coll["totalSize"] > 0

    for field in NON_PUBLIC_COLL_FIELDS:
        assert field not in coll

    assert coll["caption"] == CAPTION

    assert coll["homeUrl"]
    assert coll["homeUrlTs"]

    assert coll["allowPublicDownload"] is False

    thumbnail = coll["thumbnail"]
    assert thumbnail

    assert thumbnail["name"]
    assert thumbnail["path"].startswith("http://localhost:30870/data/")
    assert thumbnail["hash"]
    assert thumbnail["size"]
    assert thumbnail["mime"]

    for field in NON_PUBLIC_IMAGE_FIELDS:
        assert field not in thumbnail

    # Invalid org slug - don't reveal whether org exists or not, use
    # same exception as if collection doesn't exist
    r = requests.get(
        f"{API_PREFIX}/public/orgs/doesntexist/collections/{PUBLIC_COLLECTION_SLUG}"
    )
    assert r.status_code == 404
    assert r.json()["detail"] == "collection_not_found"

    # Unused slug
    random_uuid = uuid4()
    r = requests.get(
        f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/someslugnotinuse"
    )
    assert r.status_code == 404
    assert r.json()["detail"] == "collection_not_found"

    # Collection isn't public
    r = requests.get(
        f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{UPDATED_SLUG}"
    )
    assert r.status_code == 404
    assert r.json()["detail"] == "collection_not_found"


def test_get_public_collection_unlisted(crawler_auth_headers, default_org_id):
    # Make second public coll unlisted
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
        headers=crawler_auth_headers,
        json={
            "access": "unlisted",
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    # Verify single public collection GET endpoint works for unlisted collection
    r = requests.get(
        f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{SECOND_PUBLIC_COLL_SLUG}"
    )
    assert r.status_code == 200
    coll = r.json()

    assert coll["id"] == _second_public_coll_id
    assert coll["oid"] == default_org_id
    assert coll["access"] == "unlisted"
    assert coll["name"]
    assert coll["created"]
    assert coll["modified"]
    assert coll["slug"] == SECOND_PUBLIC_COLL_SLUG
    assert coll["resources"]
    assert coll["dateEarliest"]
    assert coll["dateLatest"]
    assert coll["crawlCount"] > 0
    assert coll["pageCount"] > 0
    assert coll["uniquePageCount"] > 0
    assert coll["totalSize"] > 0
    assert coll["defaultThumbnailName"] == "orange-default.avif"
    assert coll["allowPublicDownload"]

    for field in NON_PUBLIC_COLL_FIELDS:
        assert field not in coll


def test_get_public_collection_unlisted_org_profile_disabled(
    admin_auth_headers, default_org_id
):
    # Disable org profile
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/public-profile",
        headers=admin_auth_headers,
        json={
            "enablePublicProfile": False,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    # Verify we can still get public details for unlisted collection
    r = requests.get(
        f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{SECOND_PUBLIC_COLL_SLUG}"
    )
    assert r.status_code == 200
    coll = r.json()

    assert coll["id"] == _second_public_coll_id
    assert coll["oid"] == default_org_id
    assert coll["access"] == "unlisted"
    assert coll["name"]
    assert coll["created"]
    assert coll["modified"]
    assert coll["slug"]
    assert coll["resources"]
    assert coll["dateEarliest"]
    assert coll["dateLatest"]
    assert coll["crawlCount"] > 0
    assert coll["pageCount"] > 0
    assert coll["uniquePageCount"] > 0
    assert coll["totalSize"] > 0
    assert coll["defaultThumbnailName"] == "orange-default.avif"
    assert coll["allowPublicDownload"]

    for field in NON_PUBLIC_COLL_FIELDS:
        assert field not in coll

    # Re-enable org profile
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/public-profile",
        headers=admin_auth_headers,
        json={
            "enablePublicProfile": True,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]


def test_delete_thumbnail(crawler_auth_headers, default_org_id):
    r = requests.delete(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/thumbnail",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["deleted"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json().get("thumbnail") is None

    r = requests.delete(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}/thumbnail",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 404
    assert r.json()["detail"] == "thumbnail_not_found"


def test_unset_collection_home_url(
    crawler_auth_headers, default_org_id, crawler_crawl_id
):
    # Unset home url
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/home-url",
        headers=crawler_auth_headers,
        json={"pageId": None},
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    # Check that fields were unset in collection as expected
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data.get("homeUrl") is None
    assert data.get("homeUrlTs") is None
    assert data.get("homeUrlPageId") is None


def test_download_streaming_public_collection(crawler_auth_headers, default_org_id):
    # Check that download is blocked if allowPublicDownload is False
    with requests.get(
        f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}/download",
        stream=True,
    ) as r:
        assert r.status_code == 403

    # Set allowPublicDownload to True and then check downloading works
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
        headers=crawler_auth_headers,
        json={
            "allowPublicDownload": True,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}/download",
            stream=True,
        ) as r:
            assert r.status_code == 200
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) == 2
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_download_streaming_public_collection_profile_disabled(
    admin_auth_headers, default_org_id
):
    # Disable org public profile and ensure download still works for public collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/public-profile",
        headers=admin_auth_headers,
        json={
            "enablePublicProfile": False,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}/download",
            stream=True,
        ) as r:
            assert r.status_code == 200
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) == 2
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_get_public_collection_slug_redirect(admin_auth_headers, default_org_id):
    # Update public collection slug
    new_slug = "new-slug"

    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}",
        headers=admin_auth_headers,
        json={
            "slug": new_slug,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    # Get public collection from previous slug
    r = requests.get(
        f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}"
    )
    assert r.status_code == 200
    coll = r.json()

    assert coll["id"] == _public_coll_id
    assert coll["oid"] == default_org_id
    assert coll["slug"] == new_slug

    # Rename second public collection slug to now-unused PUBLIC_COLLECTION_SLUG
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
        headers=admin_auth_headers,
        json={
            "slug": PUBLIC_COLLECTION_SLUG,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    # Delete second public collection
    r = requests.delete(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["success"]

    # Verify that trying to go to PUBLIC_COLLECTION_SLUG now 404s instead of taking
    # us to the collection that had the slug before it was reassigned
    r = requests.get(
        f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{PUBLIC_COLLECTION_SLUG}"
    )
    assert r.status_code == 404


def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_id):
    # Delete second collection
    r = requests.delete(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["success"]

    # Verify collection id was removed from crawl
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert _second_coll_id not in r.json()["collectionIds"]

    # Make a new empty (no crawls) collection and delete it
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "name": "To delete",
            "description": "Deleting a collection with no crawls should work.",
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["added"]
    coll_id = data["id"]

    r = requests.delete(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{coll_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["success"]