browsertrix/backend/test/test_run_crawl.py
2023-02-08 18:36:22 -08:00

153 lines
4.0 KiB
Python

import requests
import hashlib
import time
import io
import zipfile
from .conftest import API_PREFIX, HOST_PREFIX
# State handed from one test to the next, so the tests in this module depend
# on running in file order. test_wait_for_complete fills in the path, size and
# hash; test_download_wacz fills in the content.
wacz_path = wacz_size = wacz_hash = wacz_content = None
def test_list_orgs(admin_auth_headers, default_org_id):
    """Check that the org listing is non-empty and includes the default org."""
    r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
    # Check the HTTP status first so an error response is reported as such,
    # not as a KeyError or JSON decode error further down.
    assert r.status_code == 200

    orgs = r.json()["orgs"]
    assert len(orgs) > 0

    org_ids = [org["id"] for org in orgs]
    assert default_org_id in org_ids
def test_create_new_config(admin_auth_headers, default_org_id):
    """POST a crawl config with ``runNow`` enabled and check the response.

    The response must be a 200 whose JSON body has truthy ``added`` and
    ``run_now_job`` fields.
    """
    endpoint = f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/"
    payload = {
        "runNow": True,
        "name": "Test Crawl",
        "config": {"seeds": ["https://webrecorder.net/"]},
    }

    resp = requests.post(endpoint, headers=admin_auth_headers, json=payload)
    assert resp.status_code == 200

    result = resp.json()
    assert result["added"]
    assert result["run_now_job"]
def test_wait_for_complete(admin_auth_headers, default_org_id, admin_crawl_id):
    """Check the crawl's replay.json and record its WACZ details.

    Asserts that the crawl state is "complete", that exactly one resource is
    listed, and that the expected tags are present. The resource's path, size
    and hash are then stored in module globals for the tests that follow.
    """
    global wacz_path, wacz_size, wacz_hash

    # NOTE(review): despite the name, nothing polls here; presumably the
    # admin_crawl_id fixture only yields once the crawl has finished. Confirm
    # in conftest.
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
        headers=admin_auth_headers,
    )
    # Check the HTTP status first so an error response is not reported as a
    # KeyError on the missing fields below.
    assert r.status_code == 200
    data = r.json()

    assert data["state"] == "complete"
    assert len(data["resources"]) == 1

    resource = data["resources"][0]
    assert resource["path"]
    assert data["tags"] == ["wr-test-1", "wr-test-2"]

    wacz_path = resource["path"]
    wacz_size = resource["size"]
    wacz_hash = resource["hash"]
def test_crawl_info(admin_auth_headers, default_org_id, admin_crawl_id):
    """Check that the crawl's ``fileSize`` equals the recorded WACZ size.

    Compares against the module-level ``wacz_size``, which
    test_wait_for_complete sets.
    """
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
    )
    # Check the HTTP status before reading fields from the body, as the other
    # crawl GETs in this module do.
    assert r.status_code == 200
    data = r.json()
    assert data["fileSize"] == wacz_size
def test_download_wacz():
    """Download the WACZ and check its length and SHA-256 digest.

    Both are compared against the module-level values recorded earlier. The
    downloaded bytes are then kept in ``wacz_content`` for test_verify_wacz.
    """
    global wacz_content

    resp = requests.get(HOST_PREFIX + wacz_path)
    assert resp.status_code == 200

    body = resp.content
    assert len(body) == wacz_size

    digest = hashlib.sha256(body).hexdigest()
    # On mismatch, show both digests in the failure message.
    assert digest == wacz_hash, (digest, wacz_hash)

    wacz_content = body
def test_verify_wacz():
    """Open the downloaded WACZ as a zip and check its page list.

    The archive must contain ``pages/pages.jsonl``, and that file must mention
    the seed URL. Reads the bytes from the module-level ``wacz_content``.
    """
    # Context managers close the archive and the member handle even when an
    # assertion fails part-way through.
    with zipfile.ZipFile(io.BytesIO(wacz_content)) as z:
        assert "pages/pages.jsonl" in z.namelist()

        with z.open("pages/pages.jsonl") as pages_file:
            pages = pages_file.read().decode("utf-8")

    assert '"https://webrecorder.net/"' in pages
def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id):
    """Exercise PATCH on a crawl: set tags and notes, then clear them again.

    After each PATCH the crawl is fetched again to confirm the stored values.
    """
    crawl_url = f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}"

    def fetch_crawl():
        # GET the crawl, require a 200 response, and return the JSON body.
        resp = requests.get(crawl_url, headers=admin_auth_headers)
        assert resp.status_code == 200
        return resp.json()

    # Starting state: the original tags and no notes.
    crawl = fetch_crawl()
    assert sorted(crawl["tags"]) == ["wr-test-1", "wr-test-2"]
    # Old crawls may lack the notes field, so a missing key counts as empty.
    assert not crawl.get("notes")

    # Set new tags and notes.
    new_tags = ["wr-test-1-updated", "wr-test-2-updated"]
    new_notes = "Lorem ipsum test note."
    resp = requests.patch(
        crawl_url,
        headers=admin_auth_headers,
        json={"tags": new_tags, "notes": new_notes},
    )
    assert resp.status_code == 200
    assert resp.json()["success"]

    # Confirm the update was stored.
    crawl = fetch_crawl()
    assert sorted(crawl["tags"]) == sorted(new_tags)
    assert crawl["notes"] == new_notes

    # Clear both fields and confirm they read back empty.
    resp = requests.patch(
        crawl_url,
        headers=admin_auth_headers,
        json={"tags": [], "notes": None},
    )
    assert resp.status_code == 200

    crawl = fetch_crawl()
    assert crawl["tags"] == []
    assert not crawl["notes"]