browsertrix/backend/test/test_crawlconfigs.py
Tessa Walsh cd7b695520
Add backend support for custom behaviors + validation endpoint (#2505)
Backend support for #2151 

Adds support for specifying custom behaviors via a list of strings.

When workflows are added or modified, minimal backend validation is done
to ensure that all custom behavior URLs are valid URLs (after removing
the git prefix and custom query arguments).
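
A minimal sketch of that syntax check (illustrative only, not the actual
backend code): strip the `git+` prefix and any query string, then verify that
what remains parses as an http(s) URL.

```python
from urllib.parse import urlparse


def is_valid_custom_behavior_url(value: str) -> bool:
    # Hypothetical helper: drop the "git+" prefix and query args, then check
    # that the remainder is a well-formed http(s) URL.
    url = value.removeprefix("git+").split("?", 1)[0]
    parsed = urlparse(url)
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)
```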

A separate `POST /crawlconfigs/validate/custom-behavior` endpoint is
also added, which can be used to validate a custom behavior URL. It
performs the same syntax check as above and then:
- For URLs that point directly to a behavior file, ensures the URL resolves
and returns a 2xx/3xx status code
- For Git repositories, uses `git ls-remote` to ensure the repository exists
(and that the branch exists, if one is specified), as sketched below
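
A rough sketch of those two checks (the helper name and exact calls are
illustrative assumptions, not the backend's actual implementation):

```python
import subprocess
from urllib.parse import parse_qs, urlparse

import requests


def check_custom_behavior_exists(url: str) -> bool:
    # Hypothetical helper mirroring the endpoint's checks described above.
    if url.startswith("git+"):
        parsed = urlparse(url.removeprefix("git+"))
        branch = parse_qs(parsed.query).get("branch", [""])[0]
        repo = parsed._replace(query="").geturl()
        cmd = ["git", "ls-remote", repo] + ([branch] if branch else [])
        proc = subprocess.run(cmd, capture_output=True, timeout=30)
        # Repo must exist; if a branch was given, ls-remote must list a matching ref
        return proc.returncode == 0 and (not branch or bool(proc.stdout.strip()))
    # Direct link to a behavior file: it must resolve with a 2xx/3xx status
    resp = requests.head(url, allow_redirects=False, timeout=30)
    return 200 <= resp.status_code < 400
```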

---------

Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
2025-04-02 16:20:51 -07:00


import time

import requests

from .conftest import API_PREFIX

cid = None
UPDATED_NAME = "Updated name"
UPDATED_DESCRIPTION = "Updated description"
UPDATED_TAGS = ["tag3", "tag4"]
_coll_id = None
_admin_crawl_cid = None


def test_crawl_config_usernames(
crawler_auth_headers, default_org_id, crawler_config_id
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["createdByName"]
assert data["modifiedByName"]
assert data["lastStartedByName"]
created = data["created"]
assert created
assert created.endswith("Z")
modified = data["modified"]
assert modified
    assert modified.endswith("Z")


def test_add_crawl_config(crawler_auth_headers, default_org_id, sample_crawl_data):
# Create crawl config
sample_crawl_data["schedule"] = "0 0 * * *"
sample_crawl_data["profileid"] = ""
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 200
data = r.json()
global cid
    cid = data["id"]


def test_update_name_only(crawler_auth_headers, default_org_id):
# update name only
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"name": "updated name 1"},
)
assert r.status_code == 200
data = r.json()
assert data["updated"]
assert data["metadata_changed"] == True
assert data["settings_changed"] == False
def test_update_desription_only(crawler_auth_headers, default_org_id):
# update description only
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"description": "updated description"},
)
assert r.status_code == 200
data = r.json()
assert data["updated"]
assert data["metadata_changed"] == True
assert data["settings_changed"] == False
def test_update_crawl_config_metadata(crawler_auth_headers, default_org_id):
# Make a new collection
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=crawler_auth_headers,
json={
"crawlIds": [],
"name": "autoAddUpdate",
},
)
assert r.status_code == 200
data = r.json()
global _coll_id
_coll_id = data["id"]
assert _coll_id
# Update crawl config
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={
"name": UPDATED_NAME,
"description": UPDATED_DESCRIPTION,
"tags": UPDATED_TAGS,
"autoAddCollections": [_coll_id],
},
)
assert r.status_code == 200
data = r.json()
assert data["updated"]
assert data["metadata_changed"] == True
assert data["settings_changed"] == False
def test_verify_update(crawler_auth_headers, default_org_id):
# Verify update was successful
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["name"] == UPDATED_NAME
assert data["description"] == UPDATED_DESCRIPTION
assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
assert data["autoAddCollections"] == [_coll_id]
assert data["firstSeed"] == "https://example.com/"
def test_update_config_invalid_format(
crawler_auth_headers, default_org_id, sample_crawl_data
):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={
"config": {
"seeds": ["https://example.com/"],
"scopeType": "domain",
"limit": 10,
}
},
)
    assert r.status_code == 422


def test_update_config_invalid_exclude_regex(
crawler_auth_headers, default_org_id, sample_crawl_data
):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"config": {"exclude": "["}},
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_regex"
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"config": {"exclude": ["abc.*", "["]}},
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_regex"
def test_verify_default_select_links(
crawler_auth_headers, default_org_id, sample_crawl_data
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["config"]["selectLinks"] == ["a[href]->href"]
def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={
"config": {
"seeds": [{"url": "https://example.com/"}],
"scopeType": "domain",
"selectLinks": ["a[href]->href", "script[src]->src"],
}
},
)
assert r.status_code == 200
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["config"]["scopeType"] == "domain"
assert data["config"]["selectLinks"] == ["a[href]->href", "script[src]->src"]
def test_update_config_no_changes(
crawler_auth_headers, default_org_id, sample_crawl_data
):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={
"config": {
"seeds": [{"url": "https://example.com/"}],
"scopeType": "domain",
"selectLinks": ["a[href]->href", "script[src]->src"],
}
},
)
assert r.status_code == 200
data = r.json()
assert data["settings_changed"] == False
assert data["metadata_changed"] == False
def test_update_crawl_timeout(crawler_auth_headers, default_org_id, sample_crawl_data):
# Verify that updating crawl timeout works
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"crawlTimeout": 60},
)
assert r.status_code == 200
data = r.json()
assert data["settings_changed"] == True
assert data["metadata_changed"] == False
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["crawlTimeout"] == 60
def test_update_max_crawl_size(crawler_auth_headers, default_org_id, sample_crawl_data):
# Verify that updating crawl timeout works
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"maxCrawlSize": 4096},
)
assert r.status_code == 200
data = r.json()
assert data["settings_changed"] == True
assert data["metadata_changed"] == False
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["maxCrawlSize"] == 4096
def test_verify_delete_tags(crawler_auth_headers, default_org_id):
# Verify that deleting tags and name works as well
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"tags": [], "name": None},
)
assert r.status_code == 200
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert not data["name"]
assert data["tags"] == []
def test_verify_revs_history(crawler_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/revs",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert len(items) == 3
sorted_data = sorted(items, key=lambda revision: revision["rev"])
    assert sorted_data[0]["config"]["scopeType"] == "prefix"


def test_workflow_total_size_and_last_crawl_stats(
crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] > 0
items = data["items"]
for workflow in items:
assert workflow.get("config") is None
assert workflow["seedCount"]
assert workflow["firstSeed"]
last_crawl_id = workflow.get("lastCrawlId")
if last_crawl_id and last_crawl_id in (admin_crawl_id, crawler_crawl_id):
assert workflow["totalSize"] > 0
assert workflow["crawlCount"] > 0
assert workflow["crawlSuccessfulCount"] > 0
assert workflow["lastCrawlId"]
assert workflow["lastCrawlStartTime"]
assert workflow["lastStartedByName"]
assert workflow["lastCrawlTime"]
assert workflow["lastCrawlState"]
assert workflow["lastRun"]
assert workflow["lastCrawlSize"] > 0
if last_crawl_id == admin_crawl_id:
global _admin_crawl_cid
_admin_crawl_cid = workflow["id"]
assert _admin_crawl_cid
else:
assert workflow["totalSize"] == 0
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["totalSize"] > 0
assert data["crawlCount"] > 0
assert data["crawlSuccessfulCount"] > 0
assert data["lastCrawlId"]
assert data["lastCrawlStartTime"]
assert data["lastStartedByName"]
assert data["lastCrawlTime"]
assert data["lastCrawlState"]
assert data["lastRun"]
assert data["lastCrawlSize"] > 0
def test_incremental_workflow_total_size_and_last_crawl_stats(
crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
# Get baseline values
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["crawlCount"] == 1
assert data["crawlSuccessfulCount"] == 1
total_size = data["totalSize"]
last_crawl_id = data["lastCrawlId"]
last_crawl_started = data["lastCrawlStartTime"]
last_crawl_finished = data["lastCrawlTime"]
last_run = data["lastRun"]
# Run new crawl in this workflow
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/run",
headers=crawler_auth_headers,
)
assert r.status_code == 200
crawl_id = r.json()["started"]
# Wait for it to complete
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=crawler_auth_headers,
)
data = r.json()
if data["state"] == "complete":
break
time.sleep(5)
# Give time for stats to re-compute
time.sleep(10)
# Re-check stats
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["crawlCount"] == 2
assert data["crawlSuccessfulCount"] == 2
assert data["totalSize"] > total_size
assert data["lastCrawlId"] == crawl_id
assert data["lastCrawlStartTime"] > last_crawl_started
assert data["lastCrawlTime"] > last_crawl_finished
assert data["lastRun"] > last_run
# Delete new crawl
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=crawler_auth_headers,
json={"crawl_ids": [crawl_id]},
)
assert r.status_code == 200
data = r.json()
assert data["deleted"] == 1
# Re-check stats
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["crawlCount"] == 1
assert data["crawlSuccessfulCount"] == 1
assert data["totalSize"] == total_size
assert data["lastCrawlId"] == last_crawl_id
assert data["lastCrawlStartTime"] == last_crawl_started
assert data["lastCrawlTime"] == last_crawl_finished
assert data["lastRun"] == last_run
def test_get_config_seeds(crawler_auth_headers, default_org_id, url_list_config_id):
# Make sure seeds aren't included in the crawlconfig detail
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json().get("config").get("seeds") is None
# Test getting seeds from separate endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}/seeds",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
EXPECTED_SEED_URLS = [
"https://webrecorder.net/",
"https://example.com/",
"https://specs.webrecorder.net/",
]
found_seed_urls = []
for item in data["items"]:
found_seed_urls.append(item["url"])
assert sorted(found_seed_urls) == sorted(EXPECTED_SEED_URLS)
# Test getting seeds with low page size
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}/seeds?pageSize=2",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert len(items) == 2
for item in items:
assert item["url"] in EXPECTED_SEED_URLS
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}/seeds?pageSize=2&page=2",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert len(items) == 1
assert items[0]["url"] in EXPECTED_SEED_URLS
def test_get_crawler_channels(crawler_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/crawler-channels",
headers=crawler_auth_headers,
)
assert r.status_code == 200
crawler_channels = r.json()["channels"]
assert crawler_channels
assert len(crawler_channels) == 2
for crawler_channel in crawler_channels:
assert crawler_channel["id"]
assert crawler_channel["image"]
def test_add_crawl_config_invalid_exclude_regex(
crawler_auth_headers, default_org_id, sample_crawl_data
):
sample_crawl_data["config"]["exclude"] = "["
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_regex"
sample_crawl_data["config"]["exclude"] = ["abc.*", "["]
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_regex"
def test_add_crawl_config_custom_behaviors_invalid_url(
crawler_auth_headers, default_org_id, sample_crawl_data
):
sample_crawl_data["config"]["customBehaviors"] = ["http"]
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_custom_behavior"
def test_add_crawl_config_custom_behaviors_valid_url(
crawler_auth_headers, default_org_id, sample_crawl_data
):
url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js"
sample_crawl_data["config"]["customBehaviors"] = [url]
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 200
data = r.json()
config_id = data["id"]
assert config_id
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == config_id
assert data["config"]["customBehaviors"] == [url]
def test_add_update_crawl_config_custom_behaviors_git_url(
crawler_auth_headers, default_org_id, sample_crawl_data
):
git_url = "git+https://github.com/webrecorder/custom-behaviors"
git_url_with_params = (
"git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors"
)
# Create workflow and validate it looks like we expect
sample_crawl_data["config"]["customBehaviors"] = [git_url]
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 200
data = r.json()
config_id = data["id"]
assert config_id
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == config_id
assert data["config"]["customBehaviors"] == [git_url]
# Try to update custom behaviors with invalid url, validate unchanged
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/",
headers=crawler_auth_headers,
json={
"config": {
"customBehaviors": [git_url, "not-a-url"],
}
},
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_custom_behavior"
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == config_id
assert data["config"]["customBehaviors"] == [git_url]
# Update custom behaviors with valid url, validate changed
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/",
headers=crawler_auth_headers,
json={
"config": {
"customBehaviors": [git_url_with_params],
}
},
)
assert r.status_code == 200
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == config_id
assert data["config"]["customBehaviors"] == [git_url_with_params]
def test_validate_custom_behavior(crawler_auth_headers, default_org_id):
valid_url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js"
invalid_url_404 = "https://webrecorder.net/nonexistent/behavior.js"
doesnt_resolve_url = "https://nonexistenturl-for-testing-browsertrix.com"
malformed_url = "http"
git_url = "git+https://github.com/webrecorder/custom-behaviors"
invalid_git_url = "git+https://github.com/webrecorder/doesntexist"
private_git_url = "git+https://github.com/webrecorder/website"
git_url_with_params = (
"git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors"
)
git_url_invalid_branch = (
"git+https://github.com/webrecorder/custom-behaviors?branch=doesntexist"
)
# Success
for url in (valid_url, git_url, git_url_with_params):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
headers=crawler_auth_headers,
json={"customBehavior": url},
)
assert r.status_code == 200
assert r.json()["success"]
# Behavior 404s
for url in (invalid_url_404, doesnt_resolve_url):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
headers=crawler_auth_headers,
json={"customBehavior": url},
)
assert r.status_code == 404
assert r.json()["detail"] == "custom_behavior_not_found"
# Malformed url
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
headers=crawler_auth_headers,
json={"customBehavior": malformed_url},
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_custom_behavior"
# Git repo doesn't exist
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
headers=crawler_auth_headers,
json={"customBehavior": invalid_git_url},
)
assert r.status_code == 404
assert r.json()["detail"] == "custom_behavior_not_found"
# Git repo isn't public
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
headers=crawler_auth_headers,
json={"customBehavior": private_git_url},
)
assert r.status_code == 404
assert r.json()["detail"] == "custom_behavior_not_found"
# Git branch doesn't exist
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
headers=crawler_auth_headers,
json={"customBehavior": git_url_invalid_branch},
)
assert r.status_code == 404
assert r.json()["detail"] == "custom_behavior_branch_not_found"