Backend support for #2151

Adds support for specifying custom behaviors via a list of strings. When workflows are added or modified, minimal backend validation ensures that all custom behavior URLs are valid URLs (after removing the git prefix and custom query arguments).

A separate `POST /crawlconfigs/validate/custom-behavior` endpoint is also added, which can be used to validate a single custom behavior URL. It performs the same syntax check as above and then:

- For URLs pointing directly to a behavior file, ensures the URL resolves and returns a 2xx/3xx status code
- For Git repositories, uses `git ls-remote` to ensure the repository exists (and that the branch exists, if specified)

---------

Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
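As a rough illustration of the checks described above, here is a minimal, synchronous sketch (not the actual backend implementation, which is not shown here); the `validate_custom_behavior` helper name is hypothetical, the error strings mirror the `detail` values asserted in the tests below, and it assumes Python 3.9+, the `requests` library, and the `git` CLI:

```python
# Illustrative sketch only -- the real backend logic is not shown here and
# is likely async; the helper name below is hypothetical.
import subprocess
from urllib.parse import parse_qs, urlparse

import requests


def validate_custom_behavior(url: str) -> None:
    """Apply the syntax check, then the file or git-repo existence check."""
    is_git = url.startswith("git+")
    raw = url.removeprefix("git+")

    # Strip custom query arguments (?branch=...&path=...) before validating,
    # keeping the branch so it can be checked against the repo.
    parsed = urlparse(raw)
    branch = parse_qs(parsed.query).get("branch", [None])[0]
    base = parsed._replace(query="", fragment="").geturl()

    # Syntax check applied whenever a workflow is added or modified.
    if parsed.scheme not in ("http", "https") or not parsed.netloc:
        raise ValueError("invalid_custom_behavior")

    if is_git:
        # Git repository: use `git ls-remote` to confirm it exists,
        # and that the requested branch exists if one was given.
        args = ["git", "ls-remote", base] + ([branch] if branch else [])
        result = subprocess.run(args, capture_output=True, text=True)
        if result.returncode != 0:
            raise ValueError("custom_behavior_not_found")
        if branch and not result.stdout.strip():
            raise ValueError("custom_behavior_branch_not_found")
    else:
        # Direct link to a behavior file: must resolve with a 2xx/3xx status.
        resp = requests.get(base, allow_redirects=False, timeout=10)
        if resp.status_code >= 400:
            raise ValueError("custom_behavior_not_found")
```

In the endpoint, a failed check of this kind would be translated into the 400/404 responses and `detail` values asserted by `test_validate_custom_behavior` in the test file below.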
import time

import requests

from .conftest import API_PREFIX


cid = None
UPDATED_NAME = "Updated name"
UPDATED_DESCRIPTION = "Updated description"
UPDATED_TAGS = ["tag3", "tag4"]

_coll_id = None
_admin_crawl_cid = None


def test_crawl_config_usernames(
    crawler_auth_headers, default_org_id, crawler_config_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data["createdByName"]
    assert data["modifiedByName"]
    assert data["lastStartedByName"]

    created = data["created"]
    assert created
    assert created.endswith("Z")

    modified = data["modified"]
    assert modified
    assert modified.endswith("Z")


def test_add_crawl_config(crawler_auth_headers, default_org_id, sample_crawl_data):
    # Create crawl config
    sample_crawl_data["schedule"] = "0 0 * * *"
    sample_crawl_data["profileid"] = ""
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200

    data = r.json()
    global cid
    cid = data["id"]


def test_update_name_only(crawler_auth_headers, default_org_id):
    # update name only
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"name": "updated name 1"},
    )
    assert r.status_code == 200

    data = r.json()
    assert data["updated"]
    assert data["metadata_changed"] == True
    assert data["settings_changed"] == False


def test_update_description_only(crawler_auth_headers, default_org_id):
    # update description only
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"description": "updated description"},
    )
    assert r.status_code == 200

    data = r.json()
    assert data["updated"]
    assert data["metadata_changed"] == True
    assert data["settings_changed"] == False


def test_update_crawl_config_metadata(crawler_auth_headers, default_org_id):
    # Make a new collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "crawlIds": [],
            "name": "autoAddUpdate",
        },
    )
    assert r.status_code == 200
    data = r.json()

    global _coll_id
    _coll_id = data["id"]
    assert _coll_id

    # Update crawl config
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={
            "name": UPDATED_NAME,
            "description": UPDATED_DESCRIPTION,
            "tags": UPDATED_TAGS,
            "autoAddCollections": [_coll_id],
        },
    )
    assert r.status_code == 200

    data = r.json()
    assert data["updated"]
    assert data["metadata_changed"] == True
    assert data["settings_changed"] == False


def test_verify_update(crawler_auth_headers, default_org_id):
    # Verify update was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data["name"] == UPDATED_NAME
    assert data["description"] == UPDATED_DESCRIPTION
    assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
    assert data["autoAddCollections"] == [_coll_id]
    assert data["firstSeed"] == "https://example.com/"


def test_update_config_invalid_format(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "seeds": ["https://example.com/"],
                "scopeType": "domain",
                "limit": 10,
            }
        },
    )

    assert r.status_code == 422


def test_update_config_invalid_exclude_regex(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"config": {"exclude": "["}},
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_regex"

    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"config": {"exclude": ["abc.*", "["]}},
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_regex"


def test_verify_default_select_links(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["config"]["selectLinks"] == ["a[href]->href"]


def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "seeds": [{"url": "https://example.com/"}],
                "scopeType": "domain",
                "selectLinks": ["a[href]->href", "script[src]->src"],
            }
        },
    )
    assert r.status_code == 200

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()

    assert data["config"]["scopeType"] == "domain"
    assert data["config"]["selectLinks"] == ["a[href]->href", "script[src]->src"]


def test_update_config_no_changes(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "seeds": [{"url": "https://example.com/"}],
                "scopeType": "domain",
                "selectLinks": ["a[href]->href", "script[src]->src"],
            }
        },
    )
    assert r.status_code == 200

    data = r.json()
    assert data["settings_changed"] == False
    assert data["metadata_changed"] == False


def test_update_crawl_timeout(crawler_auth_headers, default_org_id, sample_crawl_data):
    # Verify that updating crawl timeout works
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"crawlTimeout": 60},
    )
    assert r.status_code == 200
    data = r.json()

    assert data["settings_changed"] == True
    assert data["metadata_changed"] == False

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()

    assert data["crawlTimeout"] == 60


def test_update_max_crawl_size(crawler_auth_headers, default_org_id, sample_crawl_data):
    # Verify that updating max crawl size works
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"maxCrawlSize": 4096},
    )
    assert r.status_code == 200
    data = r.json()

    assert data["settings_changed"] == True
    assert data["metadata_changed"] == False

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()

    assert data["maxCrawlSize"] == 4096


def test_verify_delete_tags(crawler_auth_headers, default_org_id):
    # Verify that deleting tags and name works as well
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"tags": [], "name": None},
    )
    assert r.status_code == 200

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert not data["name"]
    assert data["tags"] == []


def test_verify_revs_history(crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/revs",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data["total"] == 3
    items = data["items"]
    assert len(items) == 3
    sorted_data = sorted(items, key=lambda revision: revision["rev"])
    assert sorted_data[0]["config"]["scopeType"] == "prefix"


def test_workflow_total_size_and_last_crawl_stats(
    crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] > 0
    items = data["items"]
    for workflow in items:
        assert workflow.get("config") is None
        assert workflow["seedCount"]
        assert workflow["firstSeed"]

        last_crawl_id = workflow.get("lastCrawlId")
        if last_crawl_id and last_crawl_id in (admin_crawl_id, crawler_crawl_id):
            assert workflow["totalSize"] > 0
            assert workflow["crawlCount"] > 0
            assert workflow["crawlSuccessfulCount"] > 0

            assert workflow["lastCrawlId"]
            assert workflow["lastCrawlStartTime"]
            assert workflow["lastStartedByName"]
            assert workflow["lastCrawlTime"]
            assert workflow["lastCrawlState"]
            assert workflow["lastRun"]
            assert workflow["lastCrawlSize"] > 0

            if last_crawl_id == admin_crawl_id:
                global _admin_crawl_cid
                _admin_crawl_cid = workflow["id"]
                assert _admin_crawl_cid
        else:
            assert workflow["totalSize"] == 0

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["totalSize"] > 0
    assert data["crawlCount"] > 0
    assert data["crawlSuccessfulCount"] > 0

    assert data["lastCrawlId"]
    assert data["lastCrawlStartTime"]
    assert data["lastStartedByName"]
    assert data["lastCrawlTime"]
    assert data["lastCrawlState"]
    assert data["lastRun"]
    assert data["lastCrawlSize"] > 0


def test_incremental_workflow_total_size_and_last_crawl_stats(
    crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
    # Get baseline values
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["crawlCount"] == 1
    assert data["crawlSuccessfulCount"] == 1
    total_size = data["totalSize"]
    last_crawl_id = data["lastCrawlId"]
    last_crawl_started = data["lastCrawlStartTime"]
    last_crawl_finished = data["lastCrawlTime"]
    last_run = data["lastRun"]

    # Run new crawl in this workflow
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/run",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    crawl_id = r.json()["started"]

    # Wait for it to complete
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=crawler_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            break
        time.sleep(5)

    # Give time for stats to re-compute
    time.sleep(10)

    # Re-check stats
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["crawlCount"] == 2
    assert data["crawlSuccessfulCount"] == 2
    assert data["totalSize"] > total_size
    assert data["lastCrawlId"] == crawl_id
    assert data["lastCrawlStartTime"] > last_crawl_started
    assert data["lastCrawlTime"] > last_crawl_finished
    assert data["lastRun"] > last_run

    # Delete new crawl
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=crawler_auth_headers,
        json={"crawl_ids": [crawl_id]},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"] == 1

    # Re-check stats
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["crawlCount"] == 1
    assert data["crawlSuccessfulCount"] == 1
    assert data["totalSize"] == total_size
    assert data["lastCrawlId"] == last_crawl_id
    assert data["lastCrawlStartTime"] == last_crawl_started
    assert data["lastCrawlTime"] == last_crawl_finished
    assert data["lastRun"] == last_run


def test_get_config_seeds(crawler_auth_headers, default_org_id, url_list_config_id):
    # Make sure seeds aren't included in the crawlconfig detail
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json().get("config").get("seeds") is None

    # Test getting seeds from separate endpoint
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}/seeds",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    EXPECTED_SEED_URLS = [
        "https://webrecorder.net/",
        "https://example.com/",
        "https://specs.webrecorder.net/",
    ]
    found_seed_urls = []

    for item in data["items"]:
        found_seed_urls.append(item["url"])

    assert sorted(found_seed_urls) == sorted(EXPECTED_SEED_URLS)

    # Test getting seeds with low page size
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}/seeds?pageSize=2",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    items = data["items"]
    assert len(items) == 2
    for item in items:
        assert item["url"] in EXPECTED_SEED_URLS

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}/seeds?pageSize=2&page=2",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    items = data["items"]
    assert len(items) == 1
    assert items[0]["url"] in EXPECTED_SEED_URLS


def test_get_crawler_channels(crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/crawler-channels",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    crawler_channels = r.json()["channels"]
    assert crawler_channels
    assert len(crawler_channels) == 2
    for crawler_channel in crawler_channels:
        assert crawler_channel["id"]
        assert crawler_channel["image"]


def test_add_crawl_config_invalid_exclude_regex(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    sample_crawl_data["config"]["exclude"] = "["
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_regex"

    sample_crawl_data["config"]["exclude"] = ["abc.*", "["]
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_regex"


def test_add_crawl_config_custom_behaviors_invalid_url(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    sample_crawl_data["config"]["customBehaviors"] = ["http"]
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_custom_behavior"


def test_add_crawl_config_custom_behaviors_valid_url(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js"
    sample_crawl_data["config"]["customBehaviors"] = [url]
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200
    data = r.json()
    config_id = data["id"]
    assert config_id

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["id"] == config_id
    assert data["config"]["customBehaviors"] == [url]


def test_add_update_crawl_config_custom_behaviors_git_url(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    git_url = "git+https://github.com/webrecorder/custom-behaviors"
    git_url_with_params = (
        "git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors"
    )

    # Create workflow and validate it looks like we expect
    sample_crawl_data["config"]["customBehaviors"] = [git_url]
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200
    data = r.json()
    config_id = data["id"]
    assert config_id

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["id"] == config_id
    assert data["config"]["customBehaviors"] == [git_url]

    # Try to update custom behaviors with invalid url, validate unchanged
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "customBehaviors": [git_url, "not-a-url"],
            }
        },
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_custom_behavior"

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["id"] == config_id
    assert data["config"]["customBehaviors"] == [git_url]

    # Update custom behaviors with valid url, validate changed
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "customBehaviors": [git_url_with_params],
            }
        },
    )
    assert r.status_code == 200

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["id"] == config_id
    assert data["config"]["customBehaviors"] == [git_url_with_params]


def test_validate_custom_behavior(crawler_auth_headers, default_org_id):
    valid_url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js"
    invalid_url_404 = "https://webrecorder.net/nonexistent/behavior.js"
    doesnt_resolve_url = "https://nonexistenturl-for-testing-browsertrix.com"
    malformed_url = "http"

    git_url = "git+https://github.com/webrecorder/custom-behaviors"
    invalid_git_url = "git+https://github.com/webrecorder/doesntexist"
    private_git_url = "git+https://github.com/webrecorder/website"

    git_url_with_params = (
        "git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors"
    )
    git_url_invalid_branch = (
        "git+https://github.com/webrecorder/custom-behaviors?branch=doesntexist"
    )

    # Success
    for url in (valid_url, git_url, git_url_with_params):
        r = requests.post(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
            headers=crawler_auth_headers,
            json={"customBehavior": url},
        )
        assert r.status_code == 200
        assert r.json()["success"]

    # Behavior 404s
    for url in (invalid_url_404, doesnt_resolve_url):
        r = requests.post(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
            headers=crawler_auth_headers,
            json={"customBehavior": url},
        )
        assert r.status_code == 404
        assert r.json()["detail"] == "custom_behavior_not_found"

    # Malformed url
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
        headers=crawler_auth_headers,
        json={"customBehavior": malformed_url},
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_custom_behavior"

    # Git repo doesn't exist
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
        headers=crawler_auth_headers,
        json={"customBehavior": invalid_git_url},
    )
    assert r.status_code == 404
    assert r.json()["detail"] == "custom_behavior_not_found"

    # Git repo isn't public
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
        headers=crawler_auth_headers,
        json={"customBehavior": private_git_url},
    )
    assert r.status_code == 404
    assert r.json()["detail"] == "custom_behavior_not_found"

    # Git branch doesn't exist
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
        headers=crawler_auth_headers,
        json={"customBehavior": git_url_invalid_branch},
    )
    assert r.status_code == 404
    assert r.json()["detail"] == "custom_behavior_branch_not_found"