browsertrix/backend/test/test_run_crawl.py
Commit dc41468daf by Tessa Walsh, 2025-06-03
Allow users to run crawls with 1 or 2 browser windows (#2627)
Fixes #2425 

## Changed

- Switch the backend to sizing crawls primarily by number of browser windows
rather than by scale multiplier (including a migration to calculate
`browserWindows` from `scale` for existing workflows and crawls)
- Continue to support `scale` alongside `browserWindows` in the input models
for creating and updating workflows and for re-adjusting live crawl scale,
for backwards compatibility
- Add a new `max_browser_windows` value to the Helm chart, calculating the
value from `max_crawl_scale` as a fallback for users who already have that
value set in local charts
- Rework the frontend so users can select either any value below
`crawler_browser_instances` or a multiple of `crawler_browser_instances`
for browser windows. For instance, with `crawler_browser_instances=4` and
`max_browser_windows=8`, the user is presented with the options
1, 2, 3, 4, 8 (see the sketch below)
- Set the maximum width of the screencast to the image width returned in the
`message`
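
As a rough illustration of the sizing rules above, here is a minimal sketch.
It is not the actual backend or frontend code: the function names are made up,
and the scale conversion is an assumption based on the migration described in
this commit message.

```python
# Minimal sketch of the sizing math described above (illustrative only).
import math

def browser_windows_from_scale(scale: int, browser_instances: int) -> int:
    # Migration assumption: each scale unit was one crawler pod running
    # `crawler_browser_instances` browser windows.
    return scale * browser_instances

def scale_from_browser_windows(browser_windows: int, browser_instances: int) -> int:
    # Backwards compatibility: round up to whole pods.
    return math.ceil(browser_windows / browser_instances)

def window_options(browser_instances: int, max_browser_windows: int) -> list[int]:
    # Any value below one full pod, plus whole multiples of a pod.
    below = list(range(1, browser_instances))
    multiples = list(
        range(browser_instances, max_browser_windows + 1, browser_instances)
    )
    return below + multiples

assert window_options(4, 8) == [1, 2, 3, 4, 8]
assert browser_windows_from_scale(1, 2) == 2  # e.g. 1 scale unit, 2 browsers per pod
assert scale_from_browser_windows(2, 2) == 1
```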

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Co-authored-by: sua yoo <sua@suayoo.com>
Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>

import requests
import hashlib
import time
import io
import zipfile
import re
import csv
import codecs
import json
from tempfile import TemporaryFile
from zipfile import ZipFile, ZIP_STORED
import pytest
from .conftest import API_PREFIX, HOST_PREFIX, FINISHED_STATES
from .test_collections import UPDATED_NAME as COLLECTION_NAME
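# Module-level state shared across the tests below, which run in file order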
wacz_path = None
wacz_size = None
wacz_hash = None
wacz_content = None
page_id = None
# newly started crawl for this test suite
# (not using the fixture to be able to test running crawl)
admin_crawl_id = None
def test_list_orgs(admin_auth_headers, default_org_id):
r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
data = r.json()
orgs = data["items"]
assert len(orgs) > 0
assert data["total"] > 0
org_ids = []
for org in orgs:
org_ids.append(org["id"])
assert default_org_id in org_ids
def test_create_new_config(admin_auth_headers, default_org_id):
crawl_data = {
"runNow": False,
"name": "Test Crawl",
"config": {"seeds": [{"url": "https://old.webrecorder.net/"}]},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
assert r.status_code == 200
data = r.json()
assert data["added"]
assert data["run_now_job"] == None
assert data["storageQuotaReached"] is False
def test_start_crawl(admin_auth_headers, default_org_id):
# Start crawl.
crawl_data = {
"runNow": True,
"name": "Admin Test Crawl",
"description": "Admin Test Crawl description",
"tags": ["wr-test-1", "wr-test-2"],
"config": {
"seeds": [{"url": "https://old.webrecorder.net/", "depth": 1}],
"exclude": "community",
# limit now set via 'max_pages_per_crawl' global limit
# "limit": 1,
},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
data = r.json()
global admin_crawl_id
admin_crawl_id = data["run_now_job"]
def test_wait_for_running(admin_auth_headers, default_org_id):
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
if data["state"] == "running":
break
time.sleep(2)
def test_crawl_queue(admin_auth_headers, default_org_id):
# 422 - requires offset and count
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/queue",
headers=admin_auth_headers,
)
assert r.status_code == 422
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/queue?offset=0&count=20",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
if data["total"] > 0:
break
assert len(data["results"]) > 0
assert data["results"][0].startswith("https://")
def test_crawl_queue_match(admin_auth_headers, default_org_id):
# 422, regex required
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/queueMatchAll",
headers=admin_auth_headers,
)
assert r.status_code == 422
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/queueMatchAll?regex=webrecorder&offset=0",
headers=admin_auth_headers,
)
data = r.json()
assert data["total"] > 0
assert len(data["matched"]) > 0
assert data["matched"][0].startswith("https://")
def test_add_exclusion(admin_auth_headers, default_org_id):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=test",
headers=admin_auth_headers,
)
assert r.json()["success"] == True
def test_add_invalid_exclusion(admin_auth_headers, default_org_id):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=[",
headers=admin_auth_headers,
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_regex"
def test_remove_exclusion(admin_auth_headers, default_org_id):
r = requests.delete(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=test",
headers=admin_auth_headers,
)
assert r.json()["success"] == True
def test_wait_for_complete(admin_auth_headers, default_org_id):
state = None
data = None
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
if data["state"] in FINISHED_STATES:
state = data["state"]
break
time.sleep(5)
assert data["state"] == "complete"
assert len(data["resources"]) == 1
assert data["resources"][0]["path"]
assert len(data["initialPages"]) == 4
assert data["pagesQueryUrl"].endswith(
f"/orgs/{default_org_id}/crawls/{admin_crawl_id}/pagesSearch"
)
assert data["downloadUrl"] is None
# ensure filename matches specified pattern
# set in default_crawl_filename_template
assert re.search("/[\\d]+-testing-[\\w-]+\\.wacz", data["resources"][0]["path"])
assert data["tags"] == ["wr-test-1", "wr-test-2"]
global wacz_path
global wacz_size
global wacz_hash
wacz_path = data["resources"][0]["path"]
wacz_size = data["resources"][0]["size"]
wacz_hash = data["resources"][0]["hash"]
def test_queue_and_exclusions_error_crawl_not_running(
admin_auth_headers, default_org_id
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/queue?offset=0&count=20",
headers=admin_auth_headers,
)
assert r.status_code == 400
assert r.json()["detail"] == "crawl_not_running"
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/queueMatchAll?regex=webrecorder&offset=0",
headers=admin_auth_headers,
)
assert r.status_code == 400
assert r.json()["detail"] == "crawl_not_running"
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=test2",
headers=admin_auth_headers,
)
assert r.status_code == 400
assert r.json()["detail"] == "crawl_not_running"
def test_crawl_info(admin_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
headers=admin_auth_headers,
)
data = r.json()
assert data["fileSize"] == wacz_size
assert data["fileCount"] == 1
assert data["userName"]
assert data["version"] == 2
assert data["scale"] == 1
assert data["browserWindows"] == 2
def test_crawls_include_seed_info(admin_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
headers=admin_auth_headers,
)
data = r.json()
assert data["firstSeed"] == "https://old.webrecorder.net/"
assert data["seedCount"] == 1
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls",
headers=admin_auth_headers,
)
data = r.json()
crawls = data["items"]
assert crawls
for crawl in crawls:
assert crawl["firstSeed"]
assert crawl["seedCount"] > 0
r = requests.get(
f"{API_PREFIX}/orgs/all/crawls?runningOnly=0",
headers=admin_auth_headers,
)
data = r.json()
crawls = data["items"]
assert crawls
for crawl in crawls:
assert crawl["firstSeed"]
assert crawl["seedCount"] > 0
def test_crawl_seeds_endpoint(admin_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/seeds",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 1
assert data["items"][0]["url"] == "https://old.webrecorder.net/"
assert data["items"][0]["depth"] == 1
def test_crawls_exclude_errors(admin_auth_headers, default_org_id):
# Get endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data.get("errors") == []
# replay.json endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data.get("errors") == []
# List endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls",
headers=admin_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
for crawl in crawls:
assert data.get("errors") == []
def test_crawls_exclude_full_seeds(admin_auth_headers, default_org_id):
# Get endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
config = data.get("config")
assert config is None or config.get("seeds") is None
# replay.json endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
config = r.json().get("config")
assert config is None or config.get("seeds") is None
# List endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls",
headers=admin_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
for crawl in crawls:
config = crawl.get("config")
assert config is None or config.get("seeds") is None
def test_crawls_include_file_error_page_counts(admin_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
assert data["filePageCount"] >= 0
assert data["errorPageCount"] >= 0
def test_download_wacz():
r = requests.get(HOST_PREFIX + wacz_path)
assert r.status_code == 200
assert len(r.content) == wacz_size
h = hashlib.sha256()
h.update(r.content)
assert h.hexdigest() == wacz_hash, (h.hexdigest(), wacz_hash)
global wacz_content
wacz_content = r.content
def test_verify_wacz():
b = io.BytesIO(wacz_content)
z = zipfile.ZipFile(b)
assert "pages/pages.jsonl" in z.namelist()
# 1 seed page
pages = z.open("pages/pages.jsonl").read().decode("utf-8")
assert '"https://old.webrecorder.net/"' in pages
# 1 seed page + header line
assert len(pages.strip().split("\n")) == 2
# 1 other page
pages = z.open("pages/extraPages.jsonl").read().decode("utf-8")
assert '"https://old.webrecorder.net/blog"' in pages
# 3 other pages + header line
assert len(pages.strip().split("\n")) == 4
@pytest.mark.parametrize(
"type_path",
[
# crawls endpoint
"crawls",
# all-crawls endpoint
"all-crawls",
],
)
def test_download_wacz_crawls(
admin_auth_headers, default_org_id, admin_crawl_id, type_path
):
with TemporaryFile() as fh:
with requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/{type_path}/{admin_crawl_id}/download",
headers=admin_auth_headers,
stream=True,
) as r:
assert r.status_code == 200
for chunk in r.iter_content():
fh.write(chunk)
fh.seek(0)
with ZipFile(fh, "r") as zip_file:
contents = zip_file.namelist()
assert len(contents) >= 2
for filename in contents:
assert filename.endswith(".wacz") or filename == "datapackage.json"
assert zip_file.getinfo(filename).compress_type == ZIP_STORED
if filename == "datapackage.json":
data = zip_file.read(filename).decode("utf-8")
datapackage = json.loads(data)
assert len(datapackage["resources"]) == 1
for resource in datapackage["resources"]:
assert resource["name"] == resource["path"]
assert resource["hash"]
assert resource["bytes"]
def test_update_crawl(
admin_auth_headers,
default_org_id,
admin_crawl_id,
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert sorted(data["tags"]) == ["wr-test-1", "wr-test-2"]
assert len(data["collectionIds"]) == 1
# Make new collection
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=admin_auth_headers,
json={"name": "Crawl Update Test Collection"},
)
new_coll_id = r.json()["id"]
# Submit patch request
UPDATED_TAGS = ["wr-test-1-updated", "wr-test-2-updated"]
UPDATED_DESC = "Lorem ipsum test note."
UPDATED_NAME = "Updated crawl name"
UPDATED_COLLECTION_IDS = [new_coll_id]
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
headers=admin_auth_headers,
json={
"tags": UPDATED_TAGS,
"description": UPDATED_DESC,
"name": UPDATED_NAME,
"collectionIds": UPDATED_COLLECTION_IDS,
},
)
assert r.status_code == 200
data = r.json()
assert data["updated"]
# Verify update was successful
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
assert data["description"] == UPDATED_DESC
assert data["name"] == UPDATED_NAME
assert data["collectionIds"] == UPDATED_COLLECTION_IDS
assert data.get("reviewStatus") is None
# Update reviewStatus and verify
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
headers=admin_auth_headers,
json={
"reviewStatus": 5,
},
)
assert r.status_code == 200
data = r.json()
assert data["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
assert r.json()["reviewStatus"] == 5
# Test sorting on reviewStatus
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls?sortBy=reviewStatus",
headers=admin_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[0]["id"] == admin_crawl_id
assert crawls[0]["reviewStatus"] == 5
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls?sortBy=reviewStatus&sortDirection=1",
headers=admin_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[-1]["id"] == admin_crawl_id
assert crawls[-1]["reviewStatus"] == 5
# Test sorting on reviewStatus for all-crawls
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=reviewStatus",
headers=admin_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[0]["id"] == admin_crawl_id
assert crawls[0]["reviewStatus"] == 5
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=reviewStatus&sortDirection=1",
headers=admin_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[-1]["id"] == admin_crawl_id
assert crawls[-1]["reviewStatus"] == 5
# Try to update to invalid reviewStatus
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
headers=admin_auth_headers,
json={
"reviewStatus": "invalid",
},
)
assert r.status_code == 422
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
assert r.json()["reviewStatus"] == 5
# Verify deleting works as well
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
headers=admin_auth_headers,
json={"tags": [], "description": None},
)
assert r.status_code == 200
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["tags"] == []
assert not data["description"]
def test_crawl_stats_all_orgs_not_superadmin(crawler_auth_headers):
r = requests.get(
f"{API_PREFIX}/orgs/all/crawls/stats", headers=crawler_auth_headers
)
assert r.status_code == 403
def test_crawl_stats_all_orgs(admin_auth_headers):
with requests.get(
f"{API_PREFIX}/orgs/all/crawls/stats", headers=admin_auth_headers, stream=True
) as r:
assert r.status_code == 200
# Wait for stream content
if not r.content:
while True:
if r.content:
break
time.sleep(5)
buffer = r.iter_lines()
for row in csv.DictReader(
codecs.iterdecode(buffer, "utf-8"), skipinitialspace=True
):
assert row["id"]
assert row["oid"]
assert row["org"]
assert row["cid"]
assert row["name"] or row["name"] == ""
assert row["state"]
assert row["userid"]
assert row["user"]
assert row["started"]
assert row["finished"] or row["finished"] is None
assert row["duration"] or row["duration"] == 0
assert row["pages"] or row["pages"] == 0
assert row["filesize"] or row["filesize"] == 0
assert row["avg_page_time"] or row["avg_page_time"] == 0
def test_crawl_stats(crawler_auth_headers, default_org_id):
with requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/stats",
headers=crawler_auth_headers,
stream=True,
) as r:
assert r.status_code == 200
# Wait for stream content
if not r.content:
while True:
if r.content:
break
time.sleep(5)
buffer = r.iter_lines()
for row in csv.DictReader(
codecs.iterdecode(buffer, "utf-8"), skipinitialspace=True
):
assert row["id"]
assert row["oid"] == default_org_id
assert row["org"]
assert row["cid"]
assert row["name"] or row["name"] == ""
assert row["state"]
assert row["userid"]
assert row["user"]
assert row["finished"] or row["finished"] is None
assert row["duration"] or row["duration"] == 0
assert row["pages"] or row["pages"] == 0
assert row["filesize"] or row["filesize"] == 0
assert row["avg_page_time"] or row["avg_page_time"] == 0
started = row["started"]
assert started
assert started.endswith("Z")
def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
# Test GET list endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
pages = data["items"]
assert pages
for page in pages:
assert page["id"]
assert page["oid"]
assert page["crawl_id"]
assert page["url"]
assert page["ts"]
assert page.get("title") or page.get("title") is None
assert page["loadState"]
assert page["status"]
assert page["mime"]
assert page["filename"]
assert page["depth"] is not None
assert page["favIconUrl"]
assert page["isSeed"] in (True, False)
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)
# Test GET page endpoint
global page_id
test_page = pages[0]
page_id = test_page["id"]
test_page_url = test_page["url"]
test_page_ts = test_page["ts"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
page = r.json()
assert page["id"] == page_id
assert page["oid"]
assert page["crawl_id"]
assert page["url"]
assert page["ts"]
assert page.get("title") or page.get("title") is None
assert page["loadState"]
assert page["mime"]
assert page["filename"]
assert page["depth"] is not None
assert page["favIconUrl"]
assert page["isSeed"] in (True, False)
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)
assert page["notes"] == []
assert page.get("userid") is None
assert page.get("modified") is None
assert page.get("approved") is None
# Test exact url filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?url={test_page_url}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
for matching_page in data["items"]:
assert matching_page["url"] == test_page_url
# Test exact url and ts filters together
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?url={test_page_url}&ts={test_page_ts}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
for matching_page in data["items"]:
assert matching_page["url"] == test_page_url
assert matching_page["ts"] == test_page_ts
# Test urlPrefix filter
url_prefix = test_page_url[:8]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?urlPrefix={url_prefix}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 1
found_matching_page = False
for page in data["items"]:
if page["id"] == page_id and page["url"] == test_page_url:
found_matching_page = True
assert found_matching_page
# Test isSeed filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?isSeed=True",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 1
for page in data["items"]:
assert page["isSeed"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?isSeed=False",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 2
for page in data["items"]:
assert page["isSeed"] is False
# Test depth filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?depth=0",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 1
for page in data["items"]:
assert page["depth"] == 0
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?depth=1",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 2
for page in data["items"]:
assert page["depth"] == 1
def test_crawl_pages_qa_filters(crawler_auth_headers, default_org_id, crawler_crawl_id):
# Test reviewed filter (page has no notes or approval, so it should show up under reviewed=False)
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=False",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 3
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 0
# Update page with approval
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
headers=crawler_auth_headers,
json={
"approved": True,
},
)
assert r.status_code == 200
assert r.json()["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["approved"]
# Test approval filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=True",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 1
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=False",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 0
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=True,False",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 1
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=None",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 2
# Test reviewed filter (page is now approved, so it shows up under reviewed=True; the other pages show under False)
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=False",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 2
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 1
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
page = r.json()
assert page["id"] == page_id
assert page["oid"]
assert page["crawl_id"]
assert page["url"]
assert page["ts"]
assert page.get("title") or page.get("title") is None
assert page["loadState"]
assert page["mime"]
assert page["filename"]
assert page["depth"] is not None
assert page["favIconUrl"]
assert page["isSeed"] in (True, False)
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)
assert page["notes"] == []
assert page["userid"]
assert page["approved"]
modified = page["modified"]
assert modified
assert modified.endswith("Z")
# Set approved to False and test filter again
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
headers=crawler_auth_headers,
json={
"approved": False,
},
)
assert r.status_code == 200
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=True",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 0
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=False",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 1
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=True,False",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 1
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=None",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 2
def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
# Store page counts to compare against after re-adding
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
page_count_before = data["pageCount"]
page_count_before_unique = data["uniquePageCount"]
page_count_before_files = data["filePageCount"]
page_count_before_errors = data["errorPageCount"]
# Re-add pages and verify they were correctly added
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/reAdd",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["started"]
time.sleep(10)
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] >= 0
pages = data["items"]
assert pages
for page in pages:
assert page["id"]
assert page["oid"]
assert page["crawl_id"]
assert page["url"]
assert page["ts"]
assert page.get("title") or page.get("title") is None
assert page["loadState"]
assert page["status"]
assert page["mime"]
assert page["filename"]
assert page["depth"] is not None
assert page["favIconUrl"]
assert page["isSeed"] in (True, False)
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)
# Ensure only superuser can re-add pages for all crawls in an org
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/all/pages/reAdd",
headers=crawler_auth_headers,
)
assert r.status_code == 403
# Check that crawl page counts were recalculated properly
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["pageCount"] > 0 and data["pageCount"] == page_count_before
assert (
data["uniquePageCount"] > 0
and data["uniquePageCount"] == page_count_before_unique
)
assert data["filePageCount"] == page_count_before_files
assert data["errorPageCount"] == page_count_before_errors
def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id):
note_text = "testing"
updated_note_text = "updated"
untouched_text = "untouched"
# Add note
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}/notes",
headers=crawler_auth_headers,
json={"text": note_text},
)
assert r.status_code == 200
assert r.json()["added"]
# Check that note was added
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert len(data["notes"]) == 1
first_note = data["notes"][0]
first_note_id = first_note["id"]
assert first_note_id
assert first_note["created"]
assert first_note["userid"]
assert first_note["userName"]
assert first_note["text"] == note_text
# Make sure page approval is set to None and re-test filters
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
headers=crawler_auth_headers,
json={
"approved": None,
},
)
assert r.status_code == 200
assert r.json()["updated"]
# Test approved filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=True",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 0
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=False",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 0
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=True,False",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 0
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=None",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 3
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=true,false,none",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 3
# Test reviewed filter (page now has notes, so it should show up under reviewed=True)
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=False",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 2
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 1
# Test notes filter
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?hasNotes=False",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 2
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?hasNotes=True",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 1
# Add second note to test selective updates/deletes
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}/notes",
headers=crawler_auth_headers,
json={"text": untouched_text},
)
assert r.status_code == 200
assert r.json()["added"]
# Edit first note
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}/notes",
headers=crawler_auth_headers,
json={"text": updated_note_text, "id": first_note_id},
)
assert r.status_code == 200
assert r.json()["updated"]
# Verify notes look as expected
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
notes = data["notes"]
assert len(notes) == 2
updated_note = [note for note in notes if note["id"] == first_note_id][0]
assert updated_note["text"] == updated_note_text
second_note_id = [note["id"] for note in notes if note["text"] == untouched_text][0]
assert second_note_id
# Delete both notes
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}/notes/delete",
headers=crawler_auth_headers,
json={"delete_list": [first_note_id, second_note_id]},
)
assert r.status_code == 200
assert r.json()["deleted"]
# Verify notes were deleted
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
notes = data.get("notes")
assert notes == []
def test_delete_crawls_crawler(crawler_auth_headers, default_org_id, crawler_crawl_id):
# Test that crawler user can't delete another user's crawls
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=crawler_auth_headers,
json={"crawl_ids": [admin_crawl_id]},
)
assert r.status_code == 403
data = r.json()
assert data["detail"] == "not_allowed"
# Check that pages exist for crawl
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] > 0
# Get WACZ presigned URLs for the crawl about to be deleted
wacz_presigned_urls = []
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert len(data["resources"]) >= 1
for resource in data["resources"]:
wacz_presigned_urls.append(resource["path"])
# Test that crawler user can delete own crawl
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=crawler_auth_headers,
json={"crawl_ids": [crawler_crawl_id]},
)
assert r.status_code == 200
data = r.json()
assert data["deleted"] == 1
assert data["storageQuotaReached"] is False
time.sleep(5)
# Test that crawl is not found after deleting
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 404
# Test that WACZs are deleted
for wacz_url in wacz_presigned_urls:
r = requests.get(f"http://localhost:30870{wacz_url}")
assert r.status_code == 404
# Test that associated pages are also deleted
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] == 0
def test_delete_crawls_org_owner(
admin_auth_headers,
crawler_auth_headers,
default_org_id,
admin_crawl_id,
crawler_crawl_id,
wr_specs_crawl_id,
):
# Test that org owner can delete own crawl
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=admin_auth_headers,
json={"crawl_ids": [admin_crawl_id]},
)
assert r.status_code == 200
data = r.json()
assert data["deleted"]
assert data["storageQuotaReached"] is False
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
headers=admin_auth_headers,
)
assert r.status_code == 404
# Test that org owner can delete another org user's crawls
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=admin_auth_headers,
json={"crawl_ids": [wr_specs_crawl_id]},
)
assert r.status_code == 200
data = r.json()
assert data["deleted"] == 1
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{wr_specs_crawl_id}",
headers=admin_auth_headers,
)
assert r.status_code == 404
def test_custom_behavior_logs(
custom_behaviors_crawl_id, crawler_auth_headers, default_org_id
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{custom_behaviors_crawl_id}/behaviorLogs",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
custom_log_line_count = 0
assert data["total"] > 0
for log in data["items"]:
assert log["timestamp"]
assert log["context"] in ("behavior", "behaviorScript", "behaviorScriptCustom")
if log["context"] == "behaviorScriptCustom":
assert log["message"] in (
"test-stat",
"In Test Behavior!",
)
if log["message"] in ("test-stat", "done!"):
assert log["details"]["behavior"] == "TestBehavior"
assert log["details"]["page"] == "https://specs.webrecorder.net/"
custom_log_line_count += 1
assert custom_log_line_count == 2
def test_crawls_exclude_behavior_logs(
custom_behaviors_crawl_id, admin_auth_headers, default_org_id
):
# Get endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{custom_behaviors_crawl_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data.get("behaviorLogs") == []
# replay.json endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{custom_behaviors_crawl_id}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data.get("behaviorLogs") == []
# List endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls",
headers=admin_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
for crawl in crawls:
assert data.get("behaviorLogs") == []