* tests:
  - fix cancel crawl test by ensuring state is not running or waiting
  - fix stop crawl test by ensuring stop is only initiated after at least one page has been crawled, otherwise the result may be 'failed' since no crawl data has been written yet (separate fix in crawler to avoid a loop if stopped before any data is written: webrecorder/browsertrix-crawler#314)
  - bump page limit to 4 for tests to ensure the crawl is only partially complete, not fully complete, when stopping
  - allow 'canceled' or 'partial_complete' due to a race condition
* chart: bump frontend limits in the defaults, not just for tests (addresses #780)
* crawl stop before starting:
  - if a crawl is stopped before it started, mark it as 'canceled'
  - add a test for stopping immediately, which should result in a 'canceled' crawl (see the sketch after this list)
  - attempt to increase the resync interval for immediate failure
  - nightly tests: increase page limit to test timeout
* backend:
  - detect a stopped-before-start crawl as 'failed' instead of 'done'
  - stats: return stats counters as int instead of string
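A minimal sketch of the stop-immediately test described above, written in the style of the test file below. The `/crawls/{crawl_id}/stop` endpoint path and the polling loop are assumptions drawn from the surrounding tests, and `fresh_crawl_id` is a hypothetical fixture standing in for a crawl that has just been started; none of these are confirmed by this file.

```python
import time

import requests

from .conftest import API_PREFIX


def test_stop_crawl_immediately(admin_auth_headers, default_org_id, fresh_crawl_id):
    # Assumed endpoint: request a graceful stop right after the crawl is created,
    # before any pages have been crawled.
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fresh_crawl_id}/stop",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200

    # Poll replay.json until the crawl reaches a final state.
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fresh_crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] in ("canceled", "partial_complete", "complete", "failed"):
            break
        time.sleep(5)

    # A crawl stopped before any data was written should end up 'canceled';
    # 'partial_complete' is tolerated because of the race condition noted above.
    assert data["state"] in ("canceled", "partial_complete")
```

If the stats fields are exposed on the same endpoint, the counters-as-int change could be asserted in the same test with `isinstance(..., int)` checks on the returned stats values.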
import requests
import hashlib
import time
import io
import zipfile
import re

from .conftest import API_PREFIX, HOST_PREFIX
from .test_collections import UPDATED_NAME as COLLECTION_NAME

wacz_path = None
wacz_size = None
wacz_hash = None

wacz_content = None


def test_list_orgs(admin_auth_headers, default_org_id):
    r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
    data = r.json()

    orgs = data["items"]
    assert len(orgs) > 0
    assert data["total"] > 0

    org_ids = []
    for org in orgs:
        org_ids.append(org["id"])
    assert default_org_id in org_ids


def test_create_new_config(admin_auth_headers, default_org_id):
    crawl_data = {
        "runNow": False,
        "name": "Test Crawl",
        "config": {"seeds": [{"url": "https://webrecorder.net/"}]},
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )

    assert r.status_code == 200

    data = r.json()
    assert data["added"]
    assert data["run_now_job"] is None


def test_wait_for_complete(admin_auth_headers, default_org_id, admin_crawl_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
        headers=admin_auth_headers,
    )
    data = r.json()
    assert data["state"] == "complete"

    assert len(data["resources"]) == 1
    assert data["resources"][0]["path"]

    # ensure filename matches specified pattern
    # set in default_crawl_filename_template
    assert re.search("/[\\d]+-testing-[\\w-]+\\.wacz", data["resources"][0]["path"])

    assert data["tags"] == ["wr-test-1", "wr-test-2"]

    global wacz_path
    global wacz_size
    global wacz_hash
    wacz_path = data["resources"][0]["path"]
    wacz_size = data["resources"][0]["size"]
    wacz_hash = data["resources"][0]["hash"]


def test_crawl_info(admin_auth_headers, default_org_id, admin_crawl_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
    )
    data = r.json()
    assert data["fileSize"] == wacz_size
    assert data["description"] == "Admin Test Crawl description"


def test_crawls_include_seed_info(admin_auth_headers, default_org_id, admin_crawl_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
    )
    data = r.json()
    assert data["firstSeed"] == "https://webrecorder.net/"
    assert data["seedCount"] == 1

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls",
        headers=admin_auth_headers,
    )
    data = r.json()
    crawls = data["items"]
    assert crawls
    for crawl in crawls:
        assert crawl["firstSeed"]
        assert crawl["seedCount"] > 0

    r = requests.get(
        f"{API_PREFIX}/orgs/all/crawls?runningOnly=0",
        headers=admin_auth_headers,
    )
    data = r.json()
    crawls = data["items"]
    assert crawls
    for crawl in crawls:
        assert crawl["firstSeed"]
        assert crawl["seedCount"] > 0


def test_download_wacz():
    r = requests.get(HOST_PREFIX + wacz_path)
    assert r.status_code == 200
    assert len(r.content) == wacz_size

    h = hashlib.sha256()
    h.update(r.content)
    assert h.hexdigest() == wacz_hash, (h.hexdigest(), wacz_hash)

    global wacz_content
    wacz_content = r.content


def test_verify_wacz():
    b = io.BytesIO(wacz_content)
    z = zipfile.ZipFile(b)

    assert "pages/pages.jsonl" in z.namelist()

    # 1 seed page
    pages = z.open("pages/pages.jsonl").read().decode("utf-8")
    assert '"https://webrecorder.net/"' in pages

    # 1 seed page + header line
    assert len(pages.strip().split("\n")) == 2

    # 1 other page
    pages = z.open("pages/extraPages.jsonl").read().decode("utf-8")
    assert '"https://webrecorder.net/blog"' in pages

    # 3 other pages + header line
    assert len(pages.strip().split("\n")) == 4


def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert sorted(data["tags"]) == ["wr-test-1", "wr-test-2"]
    # Add exception handling for old crawls without notes field
    try:
        assert not data["notes"]
    except KeyError:
        pass

    # Submit patch request to update tags and notes
    UPDATED_TAGS = ["wr-test-1-updated", "wr-test-2-updated"]
    UPDATED_NOTES = "Lorem ipsum test note."
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
        json={"tags": UPDATED_TAGS, "notes": UPDATED_NOTES},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["success"]

    # Verify update was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
    assert data["notes"] == UPDATED_NOTES

    # Verify deleting works as well
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
        json={"tags": [], "notes": None},
    )
    assert r.status_code == 200

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["tags"] == []
    assert not data["notes"]


def test_delete_crawls_crawler(
    crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
    # Test that crawl is in collection before deleting
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    collection = [coll for coll in data["items"] if coll["name"] == COLLECTION_NAME][0]
    crawl_ids = collection["crawlIds"]
    assert admin_crawl_id in crawl_ids
    assert crawler_crawl_id in crawl_ids

    # Test that crawler user can't delete another user's crawls
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=crawler_auth_headers,
        json={"crawl_ids": [admin_crawl_id]},
    )
    assert r.status_code == 403
    data = r.json()
    assert data["detail"] == "Not Allowed"

    # Test that crawler user can delete own crawl
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=crawler_auth_headers,
        json={"crawl_ids": [crawler_crawl_id]},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"] == 1

    # Test that crawl is no longer in collection
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    collection = [coll for coll in data["items"] if coll["name"] == COLLECTION_NAME][0]
    crawl_ids = collection["crawlIds"]
    assert admin_crawl_id in crawl_ids
    assert crawler_crawl_id not in crawl_ids

    # Test that crawl is not found after deleting
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 404


def test_delete_crawls_org_owner(
    admin_auth_headers,
    crawler_auth_headers,
    default_org_id,
    admin_crawl_id,
    crawler_crawl_id,
    wr_specs_crawl_id,
):
    # Test that org owner can delete own crawl
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": [admin_crawl_id]},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 404

    # Test that org owner can delete another org user's crawls
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": [wr_specs_crawl_id]},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"] == 1

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{wr_specs_crawl_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 404