# browsertrix/backend/test/test_crawlconfigs.py

import time

import requests

from .conftest import API_PREFIX

cid = None
cid_single_page = None

UPDATED_NAME = "Updated name"
UPDATED_DESCRIPTION = "Updated description"
UPDATED_TAGS = ["tag3", "tag4"]

_coll_id = None
_admin_crawl_cid = None
_seed_file_id = None


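# Workflow detail should include user display names and ISO-8601 "Z" timestamps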
def test_crawl_config_usernames(
    crawler_auth_headers, default_org_id, crawler_config_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["createdByName"]
    assert data["modifiedByName"]
    assert data["lastStartedByName"]
    created = data["created"]
    assert created
    assert created.endswith("Z")
    modified = data["modified"]
    assert modified
    assert modified.endswith("Z")


def test_add_crawl_config(crawler_auth_headers, default_org_id, sample_crawl_data):
    # Create crawl config
    sample_crawl_data["schedule"] = "0 0 * * *"
    sample_crawl_data["profileid"] = ""
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200
    data = r.json()
    global cid
    cid = data["id"]


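# A workflow created without browserWindows or scale defaults to 2 browser windows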
def test_verify_default_browser_windows(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data.get("scale") is None
    assert data["browserWindows"] == 2


def test_add_crawl_config_single_page(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    # Create crawl config
    sample_crawl_data["config"]["limit"] = 1
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200
    data = r.json()
    global cid_single_page
    cid_single_page = data["id"]


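# A single-page workflow (page limit of 1) defaults to a single browser window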
def test_verify_default_browser_windows_single_page(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid_single_page}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data.get("scale") is None
    assert data["browserWindows"] == 1


def test_custom_browser_windows(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    sample_crawl_data["browserWindows"] = 4
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200
    workflow_id = r.json()["id"]
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{workflow_id}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data.get("scale") is None
    assert data["browserWindows"] == 4


def test_custom_scale(crawler_auth_headers, default_org_id, sample_crawl_data):
    sample_crawl_data["scale"] = 3
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200
    workflow_id = r.json()["id"]
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{workflow_id}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data.get("scale") is None
    assert data["browserWindows"] == 6


def test_update_name_only(crawler_auth_headers, default_org_id):
    # update name only
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"name": "updated name 1"},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["updated"]
    assert data["metadata_changed"] == True
    assert data["settings_changed"] == False


def test_update_description_only(crawler_auth_headers, default_org_id):
    # update description only
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"description": "updated description"},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["updated"]
    assert data["metadata_changed"] == True
    assert data["settings_changed"] == False


def test_update_crawl_config_metadata(crawler_auth_headers, default_org_id):
    # Make a new collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "crawlIds": [],
            "name": "autoAddUpdate",
        },
    )
    assert r.status_code == 200
    data = r.json()
    global _coll_id
    _coll_id = data["id"]
    assert _coll_id
    # Update crawl config
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={
            "name": UPDATED_NAME,
            "description": UPDATED_DESCRIPTION,
            "tags": UPDATED_TAGS,
            "autoAddCollections": [_coll_id],
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["updated"]
    assert data["metadata_changed"] == True
    assert data["settings_changed"] == False


def test_verify_update(crawler_auth_headers, default_org_id):
    # Verify update was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["name"] == UPDATED_NAME
    assert data["description"] == UPDATED_DESCRIPTION
    assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
    assert data["autoAddCollections"] == [_coll_id]
    assert data["firstSeed"] == "https://example.com/"


def test_update_config_invalid_format(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "seeds": ["https://example.com/"],
                "scopeType": "domain",
                "limit": 10,
            }
        },
    )
    assert r.status_code == 422


def test_update_config_invalid_exclude_regex(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"config": {"exclude": "["}},
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_regex"
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"config": {"exclude": ["abc.*", "["]}},
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_regex"


def test_update_config_invalid_link_selector(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"config": {"selectLinks": []}},
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_link_selector"
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"config": {"selectLinks": ["a[href]->href", "->href"]}},
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_link_selector"


def test_update_config_invalid_lang(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    for invalid_code in ("f", "fra", "french"):
        r = requests.patch(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
            headers=crawler_auth_headers,
            json={"config": {"lang": invalid_code}},
        )
        assert r.status_code == 400
        assert r.json()["detail"] == "invalid_lang"


def test_verify_default_select_links(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["config"]["selectLinks"] == ["a[href]->href"]


def test_verify_default_click_selector(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["config"]["clickSelector"] == "a"


def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "seeds": [{"url": "https://example.com/"}],
                "scopeType": "domain",
                "selectLinks": ["a[href]->href", "script[src]->src"],
                "clickSelector": "button",
            }
        },
    )
    assert r.status_code == 200
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["config"]["scopeType"] == "domain"
    assert data["config"]["selectLinks"] == ["a[href]->href", "script[src]->src"]
    assert data["config"]["clickSelector"] == "button"


def test_update_config_no_changes(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "seeds": [{"url": "https://example.com/"}],
                "scopeType": "domain",
                "selectLinks": ["a[href]->href", "script[src]->src"],
                "clickSelector": "button",
            }
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["settings_changed"] == False
    assert data["metadata_changed"] == False


def test_update_crawl_timeout(crawler_auth_headers, default_org_id, sample_crawl_data):
    # Verify that updating crawl timeout works
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"crawlTimeout": 60},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["settings_changed"] == True
    assert data["metadata_changed"] == False
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["crawlTimeout"] == 60


def test_update_max_crawl_size(crawler_auth_headers, default_org_id, sample_crawl_data):
    # Verify that updating max crawl size works
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"maxCrawlSize": 4096},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["settings_changed"] == True
    assert data["metadata_changed"] == False
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["maxCrawlSize"] == 4096


def test_update_browser_windows(crawler_auth_headers, default_org_id):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"browserWindows": 1},
    )
    assert r.status_code == 200
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data.get("scale") is None
    assert data["browserWindows"] == 1


def test_update_scale(crawler_auth_headers, default_org_id):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"scale": 1},
    )
    assert r.status_code == 200
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data.get("scale") is None
    assert data["browserWindows"] == 2


def test_verify_delete_tags(crawler_auth_headers, default_org_id):
    # Verify that deleting tags and name works as well
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"tags": [], "name": None},
    )
    assert r.status_code == 200
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert not data["name"]
    assert data["tags"] == []


def test_verify_revs_history(crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/revs",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 5
    items = data["items"]
    assert len(items) == 5
    sorted_data = sorted(items, key=lambda revision: revision["rev"])
    assert sorted_data[0]["config"]["scopeType"] == "prefix"


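# Workflows with a finished crawl should expose totalSize, crawl counts, and lastCrawlStats (found/done/size) in both the list and detail endpoints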
def test_workflow_total_size_and_last_crawl_stats(
    crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] > 0
    items = data["items"]
    for workflow in items:
        assert workflow.get("config") is None
        assert workflow["seedCount"]
        assert workflow["firstSeed"]
        last_crawl_id = workflow.get("lastCrawlId")
        if last_crawl_id and last_crawl_id in (admin_crawl_id, crawler_crawl_id):
            assert workflow["totalSize"] > 0
            assert workflow["crawlCount"] > 0
            assert workflow["crawlSuccessfulCount"] > 0
            assert workflow["lastCrawlId"]
            assert workflow["lastCrawlStartTime"]
            assert workflow["lastStartedByName"]
            assert workflow["lastCrawlTime"]
            assert workflow["lastCrawlState"]
            assert workflow["lastRun"]
            assert workflow["lastCrawlSize"] > 0
            stats = workflow["lastCrawlStats"]
            assert stats["found"] > 0
            assert stats["done"] > 0
            assert stats["size"] > 0
            if last_crawl_id == admin_crawl_id:
                global _admin_crawl_cid
                _admin_crawl_cid = workflow["id"]
                assert _admin_crawl_cid
        else:
            assert workflow["totalSize"] == 0
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["totalSize"] > 0
    assert data["crawlCount"] > 0
    assert data["crawlSuccessfulCount"] > 0
    assert data["lastCrawlId"]
    assert data["lastCrawlStartTime"]
    assert data["lastStartedByName"]
    assert data["lastCrawlTime"]
    assert data["lastCrawlState"]
    assert data["lastRun"]
    assert data["lastCrawlSize"] > 0
    stats = data["lastCrawlStats"]
    assert stats["found"] > 0
    assert stats["done"] > 0
    assert stats["size"] > 0


def test_incremental_workflow_total_size_and_last_crawl_stats(
    crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
    # Get baseline values
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["crawlCount"] == 1
    assert data["crawlSuccessfulCount"] == 1
    total_size = data["totalSize"]
    last_crawl_id = data["lastCrawlId"]
    last_crawl_started = data["lastCrawlStartTime"]
    last_crawl_finished = data["lastCrawlTime"]
    last_run = data["lastRun"]
    last_stats = data["lastCrawlStats"]
    # Run new crawl in this workflow
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/run",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    crawl_id = r.json()["started"]
    # Wait for it to complete
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=crawler_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            break
        time.sleep(5)
    # Give time for stats to re-compute
    time.sleep(10)
    # Re-check stats
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["crawlCount"] == 2
    assert data["crawlSuccessfulCount"] == 2
    assert data["totalSize"] > total_size
    assert data["lastCrawlId"] == crawl_id
    assert data["lastCrawlStartTime"] > last_crawl_started
    assert data["lastCrawlTime"] > last_crawl_finished
    assert data["lastRun"] > last_run
    stats = data["lastCrawlStats"]
    assert stats["found"] > 0
    assert stats["done"] > 0
    assert stats["size"] > 0
    # Delete new crawl
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=crawler_auth_headers,
        json={"crawl_ids": [crawl_id]},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"] == 1
    # Re-check stats
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["crawlCount"] == 1
    assert data["crawlSuccessfulCount"] == 1
    assert data["totalSize"] == total_size
    assert data["lastCrawlId"] == last_crawl_id
    assert data["lastCrawlStartTime"] == last_crawl_started
    assert data["lastCrawlTime"] == last_crawl_finished
    assert data["lastRun"] == last_run
    assert data["lastCrawlStats"] == last_stats


def test_get_config_seeds(crawler_auth_headers, default_org_id, url_list_config_id):
    # Make sure seeds aren't included in the crawlconfig detail
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json().get("config").get("seeds") is None
    # Test getting seeds from separate endpoint
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}/seeds",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    EXPECTED_SEED_URLS = [
        "https://webrecorder.net/",
        "https://example.com/",
        "https://specs.webrecorder.net/",
    ]
    found_seed_urls = []
    for item in data["items"]:
        found_seed_urls.append(item["url"])
    assert sorted(found_seed_urls) == sorted(EXPECTED_SEED_URLS)
    # Test getting seeds with low page size
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}/seeds?pageSize=2",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    items = data["items"]
    assert len(items) == 2
    for item in items:
        assert item["url"] in EXPECTED_SEED_URLS
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}/seeds?pageSize=2&page=2",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    items = data["items"]
    assert len(items) == 1
    assert items[0]["url"] in EXPECTED_SEED_URLS


def test_get_crawler_channels(crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/crawler-channels",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    crawler_channels = r.json()["channels"]
    assert crawler_channels
    assert len(crawler_channels) == 2
    for crawler_channel in crawler_channels:
        assert crawler_channel["id"]
        assert crawler_channel["image"]


def test_add_crawl_config_invalid_exclude_regex(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    sample_crawl_data["config"]["exclude"] = "["
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_regex"
    sample_crawl_data["config"]["exclude"] = ["abc.*", "["]
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_regex"


def test_add_crawl_config_invalid_lang(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    for invalid_code in ("f", "fra", "french"):
        sample_crawl_data["config"]["lang"] = invalid_code
        r = requests.post(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
            headers=crawler_auth_headers,
            json=sample_crawl_data,
        )
        assert r.status_code == 400
        assert r.json()["detail"] == "invalid_lang"


def test_add_crawl_config_invalid_link_selectors(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    sample_crawl_data["config"]["selectLinks"] = []
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_link_selector"
    sample_crawl_data["config"]["selectLinks"] = ["a[href]->href", "->href"]
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_link_selector"


def test_add_crawl_config_custom_behaviors_invalid_url(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    sample_crawl_data["config"]["customBehaviors"] = ["http"]
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_custom_behavior"


def test_add_crawl_config_custom_behaviors_valid_url(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js"
    sample_crawl_data["config"]["customBehaviors"] = [url]
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200
    data = r.json()
    config_id = data["id"]
    assert config_id
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == config_id
    assert data["config"]["customBehaviors"] == [url]


def test_add_update_crawl_config_custom_behaviors_git_url(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    git_url = "git+https://github.com/webrecorder/custom-behaviors"
    git_url_with_params = (
        "git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors"
    )
    # Create workflow and validate it looks like we expect
    sample_crawl_data["config"]["customBehaviors"] = [git_url]
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200
    data = r.json()
    config_id = data["id"]
    assert config_id
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == config_id
    assert data["config"]["customBehaviors"] == [git_url]
    # Try to update custom behaviors with invalid url, validate unchanged
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "customBehaviors": [git_url, "not-a-url"],
            }
        },
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_custom_behavior"
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == config_id
    assert data["config"]["customBehaviors"] == [git_url]
    # Update custom behaviors with valid url, validate changed
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "customBehaviors": [git_url_with_params],
            }
        },
    )
    assert r.status_code == 200
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == config_id
    assert data["config"]["customBehaviors"] == [git_url_with_params]


def test_validate_custom_behavior(crawler_auth_headers, default_org_id):
    valid_url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js"
    invalid_url_404 = "https://webrecorder.net/nonexistent/behavior.js"
    doesnt_resolve_url = "https://nonexistenturl-for-testing-browsertrix.com"
    malformed_url = "http"
    git_url = "git+https://github.com/webrecorder/custom-behaviors"
    invalid_git_url = "git+https://github.com/webrecorder/doesntexist"
    private_git_url = "git+https://github.com/webrecorder/website"
    git_url_with_params = (
        "git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors"
    )
    git_url_invalid_branch = (
        "git+https://github.com/webrecorder/custom-behaviors?branch=doesntexist"
    )
    # Success
    for url in (valid_url, git_url, git_url_with_params):
        r = requests.post(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
            headers=crawler_auth_headers,
            json={"customBehavior": url},
        )
        assert r.status_code == 200
        assert r.json()["success"]
    # Behavior 404s
    for url in (invalid_url_404, doesnt_resolve_url):
        r = requests.post(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
            headers=crawler_auth_headers,
            json={"customBehavior": url},
        )
        assert r.status_code == 404
        assert r.json()["detail"] == "custom_behavior_not_found"
    # Malformed url
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
        headers=crawler_auth_headers,
        json={"customBehavior": malformed_url},
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_custom_behavior"
    # Git repo doesn't exist
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
        headers=crawler_auth_headers,
        json={"customBehavior": invalid_git_url},
    )
    assert r.status_code == 404
    assert r.json()["detail"] == "custom_behavior_not_found"
    # Git repo isn't public
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
        headers=crawler_auth_headers,
        json={"customBehavior": private_git_url},
    )
    assert r.status_code == 404
    assert r.json()["detail"] == "custom_behavior_not_found"
    # Git branch doesn't exist
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
        headers=crawler_auth_headers,
        json={"customBehavior": git_url_invalid_branch},
    )
    assert r.status_code == 404
    assert r.json()["detail"] == "custom_behavior_branch_not_found"


def test_add_crawl_config_with_seed_file(
    crawler_auth_headers, default_org_id, seed_file_id, seed_file_config_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{seed_file_config_id}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == seed_file_config_id
    assert data["name"] == "Seed File Test Crawl"
    assert data["config"]["seedFileId"] == seed_file_id
    assert data["config"]["seeds"] is None


def test_delete_in_use_seed_file(
    crawler_auth_headers, default_org_id, seed_file_id, seed_file_config_id
):
    # Attempt to delete in-use seed file, verify we get 400 response
    r = requests.delete(
        f"{API_PREFIX}/orgs/{default_org_id}/files/{seed_file_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "seed_file_in_use"
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/files/{seed_file_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["id"] == seed_file_id