browsertrix/backend/test/test_crawlconfigs.py
Tessa Walsh f7ba712646 Add seed file support to Browsertrix backend (#2710)
Fixes #2673 

Changes in this PR:

- Adds a new `file_uploads.py` module and corresponding `/files` API
prefix with methods/endpoints for uploading, fetching, and deleting seed
files (this can be extended to other file types going forward)
- Seed files are supported via `CrawlConfig.config.seedFileId` on the POST
and PATCH endpoints. The `seedFileId` is replaced by a presigned URL when
passed to the crawler by the operator (see the usage sketch after this list)
- Seed files are read when first uploaded to calculate `firstSeed` and
`seedCount`; these values are stored in the database and copied into the
workflow and crawl documents when they are created.
- `firstSeed` and `seedCount` are now also stored for other workflows,
with a migration to backfill existing data. This keeps the data consistent
and fixes some pymongo aggregations that previously assumed every workflow
would have at least one `Seed` object in `CrawlConfig.seeds`
- Seed file and thumbnail storage stats are added to org stats
- Seed file and thumbnail uploads first check that the org's storage
quota has not been exceeded and return a 400 if so
- A cron background job (run weekly each Sunday at midnight by default,
but configurable) looks for seed files at least x minutes old (1440
minutes, i.e. 1 day, by default, but configurable) that are not in use by
any workflow, and deletes them. The backend pods ensure this k8s batch job
exists on startup and create it if it does not already exist. A database
entry for each run of the job is created in the operator on job completion
so that it appears in the `/jobs` API endpoints, but retrying this type of
regularly scheduled background job is not supported, since we don't want
to accidentally create multiple competing scheduled jobs.
- Adds a `min_seed_file_crawler_image` value to the Helm chart that, if
set, is checked before creating a crawl from a workflow. If a workflow
cannot be run, the detail of the exception is returned in
`CrawlConfigAddedResponse.errorDetail` so that the frontend can display
the reason
- Adds a `SeedFile` model derived from the base `UserFile` model (formerly
`ImageFile`) and ensures all APIs returning uploaded files return an absolute
presigned URL (with either an external origin or the internal service origin)
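
As a rough usage sketch only: the upload route, query parameters, response field
names, and example values below are assumptions, not confirmed by this PR's tests;
only the `/files/{id}` GET/DELETE routes, `config.seedFileId`, and the
`seed_file_in_use` error appear in the test file below.

```python
import requests

# Hypothetical values for illustration only
api = "https://btrix.example.com/api"
org_id = "<org-id>"
headers = {"Authorization": "Bearer <access-token>"}

# 1. Upload a seed file via the new /files API.
#    The exact upload route and query params are assumptions.
with open("seeds.txt", "rb") as fh:
    r = requests.put(
        f"{api}/orgs/{org_id}/files/seedFile?filename=seeds.txt",
        headers=headers,
        data=fh,
    )
seed_file_id = r.json()["id"]  # assumed response shape

# 2. Create a workflow that references the file via config.seedFileId
#    instead of inline config.seeds.
r = requests.post(
    f"{api}/orgs/{org_id}/crawlconfigs/",
    headers=headers,
    json={"name": "Seed File Crawl", "config": {"seedFileId": seed_file_id}},
)
workflow_id = r.json()["id"]

# 3. Deleting a seed file that is still referenced by a workflow fails
#    with 400 / "seed_file_in_use" (see test_delete_in_use_seed_file below).
r = requests.delete(f"{api}/orgs/{org_id}/files/{seed_file_id}", headers=headers)
assert r.status_code == 400 and r.json()["detail"] == "seed_file_in_use"
```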

---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
2025-07-22 19:11:02 -07:00


import time
import requests
from .conftest import API_PREFIX
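# Module-level globals shared across the tests below; pytest runs these tests
# in file order, so later tests rely on IDs set by earlier ones.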
cid = None
cid_single_page = None
UPDATED_NAME = "Updated name"
UPDATED_DESCRIPTION = "Updated description"
UPDATED_TAGS = ["tag3", "tag4"]
_coll_id = None
_admin_crawl_cid = None
_seed_file_id = None
def test_crawl_config_usernames(
crawler_auth_headers, default_org_id, crawler_config_id
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["createdByName"]
assert data["modifiedByName"]
assert data["lastStartedByName"]
created = data["created"]
assert created
assert created.endswith("Z")
modified = data["modified"]
assert modified
assert modified.endswith("Z")
def test_add_crawl_config(crawler_auth_headers, default_org_id, sample_crawl_data):
# Create crawl config
sample_crawl_data["schedule"] = "0 0 * * *"
sample_crawl_data["profileid"] = ""
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 200
data = r.json()
global cid
cid = data["id"]
def test_verify_default_browser_windows(
crawler_auth_headers, default_org_id, sample_crawl_data
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data.get("scale") is None
assert data["browserWindows"] == 2
def test_add_crawl_config_single_page(
crawler_auth_headers, default_org_id, sample_crawl_data
):
# Create crawl config
sample_crawl_data["config"]["limit"] = 1
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 200
data = r.json()
global cid_single_page
cid_single_page = data["id"]
def test_verify_default_browser_windows_single_page(
crawler_auth_headers, default_org_id, sample_crawl_data
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid_single_page}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data.get("scale") is None
assert data["browserWindows"] == 1
def test_custom_browser_windows(
crawler_auth_headers, default_org_id, sample_crawl_data
):
sample_crawl_data["browserWindows"] = 4
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 200
workflow_id = r.json()["id"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{workflow_id}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data.get("scale") is None
assert data["browserWindows"] == 4
def test_custom_scale(crawler_auth_headers, default_org_id, sample_crawl_data):
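    # The legacy "scale" field is still accepted on creation but is translated
    # to browserWindows (scale 3 -> 6 windows below); "scale" itself is no
    # longer returned in the response.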
sample_crawl_data["scale"] = 3
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 200
workflow_id = r.json()["id"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{workflow_id}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data.get("scale") is None
assert data["browserWindows"] == 6
def test_update_name_only(crawler_auth_headers, default_org_id):
# update name only
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"name": "updated name 1"},
)
assert r.status_code == 200
data = r.json()
assert data["updated"]
assert data["metadata_changed"] == True
assert data["settings_changed"] == False
def test_update_description_only(crawler_auth_headers, default_org_id):
# update description only
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"description": "updated description"},
)
assert r.status_code == 200
data = r.json()
assert data["updated"]
assert data["metadata_changed"] == True
assert data["settings_changed"] == False
def test_update_crawl_config_metadata(crawler_auth_headers, default_org_id):
# Make a new collection
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=crawler_auth_headers,
json={
"crawlIds": [],
"name": "autoAddUpdate",
},
)
assert r.status_code == 200
data = r.json()
global _coll_id
_coll_id = data["id"]
assert _coll_id
# Update crawl config
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={
"name": UPDATED_NAME,
"description": UPDATED_DESCRIPTION,
"tags": UPDATED_TAGS,
"autoAddCollections": [_coll_id],
},
)
assert r.status_code == 200
data = r.json()
assert data["updated"]
assert data["metadata_changed"] == True
assert data["settings_changed"] == False
def test_verify_update(crawler_auth_headers, default_org_id):
# Verify update was successful
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["name"] == UPDATED_NAME
assert data["description"] == UPDATED_DESCRIPTION
assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
assert data["autoAddCollections"] == [_coll_id]
assert data["firstSeed"] == "https://example.com/"
def test_update_config_invalid_format(
crawler_auth_headers, default_org_id, sample_crawl_data
):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={
"config": {
"seeds": ["https://example.com/"],
"scopeType": "domain",
"limit": 10,
}
},
)
assert r.status_code == 422
def test_update_config_invalid_exclude_regex(
crawler_auth_headers, default_org_id, sample_crawl_data
):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"config": {"exclude": "["}},
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_regex"
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"config": {"exclude": ["abc.*", "["]}},
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_regex"
def test_update_config_invalid_link_selector(
crawler_auth_headers, default_org_id, sample_crawl_data
):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"config": {"selectLinks": []}},
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_link_selector"
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"config": {"selectLinks": ["a[href]->href", "->href"]}},
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_link_selector"
def test_update_config_invalid_lang(
crawler_auth_headers, default_org_id, sample_crawl_data
):
for invalid_code in ("f", "fra", "french"):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"config": {"lang": invalid_code}},
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_lang"
def test_verify_default_select_links(
crawler_auth_headers, default_org_id, sample_crawl_data
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["config"]["selectLinks"] == ["a[href]->href"]
def test_verify_default_click_selector(
crawler_auth_headers, default_org_id, sample_crawl_data
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["config"]["clickSelector"] == "a"
def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={
"config": {
"seeds": [{"url": "https://example.com/"}],
"scopeType": "domain",
"selectLinks": ["a[href]->href", "script[src]->src"],
"clickSelector": "button",
}
},
)
assert r.status_code == 200
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["config"]["scopeType"] == "domain"
assert data["config"]["selectLinks"] == ["a[href]->href", "script[src]->src"]
assert data["config"]["clickSelector"] == "button"
def test_update_config_no_changes(
crawler_auth_headers, default_org_id, sample_crawl_data
):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={
"config": {
"seeds": [{"url": "https://example.com/"}],
"scopeType": "domain",
"selectLinks": ["a[href]->href", "script[src]->src"],
"clickSelector": "button",
}
},
)
assert r.status_code == 200
data = r.json()
assert data["settings_changed"] == False
assert data["metadata_changed"] == False
def test_update_crawl_timeout(crawler_auth_headers, default_org_id, sample_crawl_data):
# Verify that updating crawl timeout works
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"crawlTimeout": 60},
)
assert r.status_code == 200
data = r.json()
assert data["settings_changed"] == True
assert data["metadata_changed"] == False
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["crawlTimeout"] == 60
def test_update_max_crawl_size(crawler_auth_headers, default_org_id, sample_crawl_data):
    # Verify that updating max crawl size works
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"maxCrawlSize": 4096},
)
assert r.status_code == 200
data = r.json()
assert data["settings_changed"] == True
assert data["metadata_changed"] == False
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["maxCrawlSize"] == 4096
def test_update_browser_windows(crawler_auth_headers, default_org_id):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"browserWindows": 1},
)
assert r.status_code == 200
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data.get("scale") is None
assert data["browserWindows"] == 1
def test_update_scale(crawler_auth_headers, default_org_id):
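    # As above, a PATCH with the legacy "scale" field updates browserWindows
    # (scale 1 -> 2 windows) rather than storing "scale" directly.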
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"scale": 1},
)
assert r.status_code == 200
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data.get("scale") is None
assert data["browserWindows"] == 2
def test_verify_delete_tags(crawler_auth_headers, default_org_id):
# Verify that deleting tags and name works as well
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"tags": [], "name": None},
)
assert r.status_code == 200
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert not data["name"]
assert data["tags"] == []
def test_verify_revs_history(crawler_auth_headers, default_org_id):
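    # Each settings change above (config data, crawl timeout, max crawl size,
    # browser windows, scale) should have created one revision; the oldest
    # revision still records the original "prefix" scope type.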
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/revs",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 5
items = data["items"]
assert len(items) == 5
sorted_data = sorted(items, key=lambda revision: revision["rev"])
assert sorted_data[0]["config"]["scopeType"] == "prefix"
def test_workflow_total_size_and_last_crawl_stats(
crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] > 0
items = data["items"]
for workflow in items:
assert workflow.get("config") is None
assert workflow["seedCount"]
assert workflow["firstSeed"]
last_crawl_id = workflow.get("lastCrawlId")
if last_crawl_id and last_crawl_id in (admin_crawl_id, crawler_crawl_id):
assert workflow["totalSize"] > 0
assert workflow["crawlCount"] > 0
assert workflow["crawlSuccessfulCount"] > 0
assert workflow["lastCrawlId"]
assert workflow["lastCrawlStartTime"]
assert workflow["lastStartedByName"]
assert workflow["lastCrawlTime"]
assert workflow["lastCrawlState"]
assert workflow["lastRun"]
assert workflow["lastCrawlSize"] > 0
if last_crawl_id == admin_crawl_id:
global _admin_crawl_cid
_admin_crawl_cid = workflow["id"]
assert _admin_crawl_cid
else:
assert workflow["totalSize"] == 0
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["totalSize"] > 0
assert data["crawlCount"] > 0
assert data["crawlSuccessfulCount"] > 0
assert data["lastCrawlId"]
assert data["lastCrawlStartTime"]
assert data["lastStartedByName"]
assert data["lastCrawlTime"]
assert data["lastCrawlState"]
assert data["lastRun"]
assert data["lastCrawlSize"] > 0
def test_incremental_workflow_total_size_and_last_crawl_stats(
crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
# Get baseline values
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["crawlCount"] == 1
assert data["crawlSuccessfulCount"] == 1
total_size = data["totalSize"]
last_crawl_id = data["lastCrawlId"]
last_crawl_started = data["lastCrawlStartTime"]
last_crawl_finished = data["lastCrawlTime"]
last_run = data["lastRun"]
# Run new crawl in this workflow
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/run",
headers=crawler_auth_headers,
)
assert r.status_code == 200
crawl_id = r.json()["started"]
# Wait for it to complete
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=crawler_auth_headers,
)
data = r.json()
if data["state"] == "complete":
break
time.sleep(5)
# Give time for stats to re-compute
time.sleep(10)
# Re-check stats
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["crawlCount"] == 2
assert data["crawlSuccessfulCount"] == 2
assert data["totalSize"] > total_size
assert data["lastCrawlId"] == crawl_id
assert data["lastCrawlStartTime"] > last_crawl_started
assert data["lastCrawlTime"] > last_crawl_finished
assert data["lastRun"] > last_run
# Delete new crawl
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=crawler_auth_headers,
json={"crawl_ids": [crawl_id]},
)
assert r.status_code == 200
data = r.json()
assert data["deleted"] == 1
# Re-check stats
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["crawlCount"] == 1
assert data["crawlSuccessfulCount"] == 1
assert data["totalSize"] == total_size
assert data["lastCrawlId"] == last_crawl_id
assert data["lastCrawlStartTime"] == last_crawl_started
assert data["lastCrawlTime"] == last_crawl_finished
assert data["lastRun"] == last_run
def test_get_config_seeds(crawler_auth_headers, default_org_id, url_list_config_id):
# Make sure seeds aren't included in the crawlconfig detail
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json().get("config").get("seeds") is None
# Test getting seeds from separate endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}/seeds",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
EXPECTED_SEED_URLS = [
"https://webrecorder.net/",
"https://example.com/",
"https://specs.webrecorder.net/",
]
found_seed_urls = []
for item in data["items"]:
found_seed_urls.append(item["url"])
assert sorted(found_seed_urls) == sorted(EXPECTED_SEED_URLS)
# Test getting seeds with low page size
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}/seeds?pageSize=2",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert len(items) == 2
for item in items:
assert item["url"] in EXPECTED_SEED_URLS
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}/seeds?pageSize=2&page=2",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 3
items = data["items"]
assert len(items) == 1
assert items[0]["url"] in EXPECTED_SEED_URLS
def test_get_crawler_channels(crawler_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/crawler-channels",
headers=crawler_auth_headers,
)
assert r.status_code == 200
crawler_channels = r.json()["channels"]
assert crawler_channels
assert len(crawler_channels) == 2
for crawler_channel in crawler_channels:
assert crawler_channel["id"]
assert crawler_channel["image"]
def test_add_crawl_config_invalid_exclude_regex(
crawler_auth_headers, default_org_id, sample_crawl_data
):
sample_crawl_data["config"]["exclude"] = "["
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_regex"
sample_crawl_data["config"]["exclude"] = ["abc.*", "["]
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_regex"
def test_add_crawl_config_invalid_lang(
crawler_auth_headers, default_org_id, sample_crawl_data
):
for invalid_code in ("f", "fra", "french"):
sample_crawl_data["config"]["lang"] = invalid_code
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_lang"
def test_add_crawl_config_invalid_link_selectors(
crawler_auth_headers, default_org_id, sample_crawl_data
):
sample_crawl_data["config"]["selectLinks"] = []
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_link_selector"
sample_crawl_data["config"]["selectLinks"] = ["a[href]->href", "->href"]
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_link_selector"
def test_add_crawl_config_custom_behaviors_invalid_url(
crawler_auth_headers, default_org_id, sample_crawl_data
):
sample_crawl_data["config"]["customBehaviors"] = ["http"]
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_custom_behavior"
def test_add_crawl_config_custom_behaviors_valid_url(
crawler_auth_headers, default_org_id, sample_crawl_data
):
url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js"
sample_crawl_data["config"]["customBehaviors"] = [url]
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 200
data = r.json()
config_id = data["id"]
assert config_id
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == config_id
assert data["config"]["customBehaviors"] == [url]
def test_add_update_crawl_config_custom_behaviors_git_url(
crawler_auth_headers, default_org_id, sample_crawl_data
):
git_url = "git+https://github.com/webrecorder/custom-behaviors"
git_url_with_params = (
"git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors"
)
# Create workflow and validate it looks like we expect
sample_crawl_data["config"]["customBehaviors"] = [git_url]
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 200
data = r.json()
config_id = data["id"]
assert config_id
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == config_id
assert data["config"]["customBehaviors"] == [git_url]
# Try to update custom behaviors with invalid url, validate unchanged
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/",
headers=crawler_auth_headers,
json={
"config": {
"customBehaviors": [git_url, "not-a-url"],
}
},
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_custom_behavior"
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == config_id
assert data["config"]["customBehaviors"] == [git_url]
# Update custom behaviors with valid url, validate changed
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/",
headers=crawler_auth_headers,
json={
"config": {
"customBehaviors": [git_url_with_params],
}
},
)
assert r.status_code == 200
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == config_id
assert data["config"]["customBehaviors"] == [git_url_with_params]
def test_validate_custom_behavior(crawler_auth_headers, default_org_id):
valid_url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js"
invalid_url_404 = "https://webrecorder.net/nonexistent/behavior.js"
doesnt_resolve_url = "https://nonexistenturl-for-testing-browsertrix.com"
malformed_url = "http"
git_url = "git+https://github.com/webrecorder/custom-behaviors"
invalid_git_url = "git+https://github.com/webrecorder/doesntexist"
private_git_url = "git+https://github.com/webrecorder/website"
git_url_with_params = (
"git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors"
)
git_url_invalid_branch = (
"git+https://github.com/webrecorder/custom-behaviors?branch=doesntexist"
)
# Success
for url in (valid_url, git_url, git_url_with_params):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
headers=crawler_auth_headers,
json={"customBehavior": url},
)
assert r.status_code == 200
assert r.json()["success"]
# Behavior 404s
for url in (invalid_url_404, doesnt_resolve_url):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
headers=crawler_auth_headers,
json={"customBehavior": url},
)
assert r.status_code == 404
assert r.json()["detail"] == "custom_behavior_not_found"
# Malformed url
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
headers=crawler_auth_headers,
json={"customBehavior": malformed_url},
)
assert r.status_code == 400
assert r.json()["detail"] == "invalid_custom_behavior"
# Git repo doesn't exist
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
headers=crawler_auth_headers,
json={"customBehavior": invalid_git_url},
)
assert r.status_code == 404
assert r.json()["detail"] == "custom_behavior_not_found"
# Git repo isn't public
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
headers=crawler_auth_headers,
json={"customBehavior": private_git_url},
)
assert r.status_code == 404
assert r.json()["detail"] == "custom_behavior_not_found"
# Git branch doesn't exist
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
headers=crawler_auth_headers,
json={"customBehavior": git_url_invalid_branch},
)
assert r.status_code == 404
assert r.json()["detail"] == "custom_behavior_branch_not_found"
def test_add_crawl_config_with_seed_file(
crawler_auth_headers, default_org_id, seed_file_id, seed_file_config_id
):
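    # The seed_file_id and seed_file_config_id fixtures (from conftest) upload
    # a seed file and create a workflow that references it via config.seedFileId
    # with no inline seeds, as the assertions below verify.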
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{seed_file_config_id}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == seed_file_config_id
assert data["name"] == "Seed File Test Crawl"
assert data["config"]["seedFileId"] == seed_file_id
assert data["config"]["seeds"] is None
def test_delete_in_use_seed_file(
crawler_auth_headers, default_org_id, seed_file_id, seed_file_config_id
):
# Attempt to delete in-use seed file, verify we get 400 response
r = requests.delete(
f"{API_PREFIX}/orgs/{default_org_id}/files/{seed_file_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 400
assert r.json()["detail"] == "seed_file_in_use"
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/files/{seed_file_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json()["id"] == seed_file_id