Fixes #2673

Changes in this PR:

- Adds a new `file_uploads.py` module and a corresponding `/files` API prefix with endpoints for uploading, retrieving, and deleting seed files (this can be extended to other types of files moving forward)
- Seed files are supported via `CrawlConfig.config.seedFileId` on the POST and PATCH endpoints. The `seedFileId` is replaced by a presigned URL when the config is passed to the crawler by the operator (a sketch of the end-to-end flow is shown below)
- Seed files are read when first uploaded to calculate `firstSeed` and `seedCount`, which are stored in the database and copied into the workflow and crawl documents when they are created
- `firstSeed` and `seedCount` are now stored for all other workflows as well, with a migration to backfill the data. This maintains consistency and fixes some pymongo aggregations that previously assumed every workflow would have at least one `Seed` object in `CrawlConfig.seeds`
- Seed file and thumbnail storage stats are added to org stats
- Seed file and thumbnail uploads first check that the org's storage quota has not been exceeded and return a 400 if it has
- A cron background job (run weekly each Sunday at midnight by default, but configurable) looks for seed files at least x minutes old (1440 minutes, i.e. 1 day, by default, but configurable) that are not in use in any workflows, and deletes them when found (a sketch of the selection logic is shown below). The backend pods ensure this k8s batch job exists on startup and create it if it does not already exist. A database entry for each run of the job is created in the operator on job completion so that it appears in the `/jobs` API endpoints, but retrying this type of regularly scheduled background job is not supported, as we don't want to accidentally create multiple competing scheduled jobs
- Adds a `min_seed_file_crawler_image` value to the Helm chart that, if set, is checked before creating a crawl from a workflow. If a workflow cannot be run, the detail of the exception is returned in `CrawlConfigAddedResponse.errorDetail` so that the frontend can display the reason
- Adds a `SeedFile` model based on the new `UserFile` base model (formerly `ImageFile`), and ensures all APIs returning uploaded files return an absolute presigned URL (with either the external origin or the internal service origin)

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
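A minimal sketch of the seed-file flow described above, in the same `requests` style as the tests in this PR. The upload route and its query parameter are assumptions for illustration; the GET/DELETE routes, `config.seedFileId`, and the `seed_file_in_use` error detail match the tests below:

```python
import requests

API_PREFIX = "http://localhost:8000/api"  # assumption: local backend
ORG_ID = "<org-id>"  # placeholder
HEADERS = {"Authorization": "Bearer <token>"}  # placeholder auth

# 1. Upload a seed file (one URL per line). The route and "filename" query
#    param here are assumptions for illustration only.
seed_urls = b"https://webrecorder.net/\nhttps://example.com/\n"
r = requests.put(
    f"{API_PREFIX}/orgs/{ORG_ID}/files/seedFile?filename=seeds.txt",
    headers=HEADERS,
    data=seed_urls,
)
r.raise_for_status()
seed_file_id = r.json()["id"]

# 2. Reference the file from a workflow via config.seedFileId instead of
#    config.seeds; firstSeed/seedCount were computed at upload time and are
#    copied onto the workflow document.
r = requests.post(
    f"{API_PREFIX}/orgs/{ORG_ID}/crawlconfigs/",
    headers=HEADERS,
    json={"name": "Seed File Workflow", "config": {"seedFileId": seed_file_id}},
)
r.raise_for_status()

# 3. Deleting a file that a workflow still references fails with a 400;
#    unreferenced files past the configured age are removed by the cron job.
r = requests.delete(f"{API_PREFIX}/orgs/{ORG_ID}/files/{seed_file_id}", headers=HEADERS)
assert r.status_code == 400
assert r.json()["detail"] == "seed_file_in_use"
```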
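And a rough sketch of the cleanup job's selection criterion described above. The collection and field names here are assumptions for illustration, not the actual implementation; the age cutoff and the "not referenced by any workflow" check are from the PR description:

```python
from datetime import datetime, timedelta, timezone

CLEANUP_MIN_AGE_MINUTES = 1440  # default: 1 day; configurable via the Helm chart


async def delete_unused_seed_files(mdb):
    """Delete seed files older than the cutoff that no workflow references.

    `mdb` is assumed to be an async (motor-style) database handle; the
    "file_uploads" and "crawl_configs" collection names are placeholders.
    """
    cutoff = datetime.now(timezone.utc) - timedelta(minutes=CLEANUP_MIN_AGE_MINUTES)
    async for file_ in mdb["file_uploads"].find({"created": {"$lt": cutoff}}):
        # A file is in use if any workflow's config.seedFileId points at it
        in_use = await mdb["crawl_configs"].find_one(
            {"config.seedFileId": file_["_id"]}
        )
        if not in_use:
            await mdb["file_uploads"].delete_one({"_id": file_["_id"]})
```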
import time

import requests

from .conftest import API_PREFIX

cid = None
cid_single_page = None
UPDATED_NAME = "Updated name"
UPDATED_DESCRIPTION = "Updated description"
UPDATED_TAGS = ["tag3", "tag4"]

_coll_id = None
_admin_crawl_cid = None

_seed_file_id = None


def test_crawl_config_usernames(
    crawler_auth_headers, default_org_id, crawler_config_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data["createdByName"]
    assert data["modifiedByName"]
    assert data["lastStartedByName"]

    created = data["created"]
    assert created
    assert created.endswith("Z")

    modified = data["modified"]
    assert modified
    assert modified.endswith("Z")


def test_add_crawl_config(crawler_auth_headers, default_org_id, sample_crawl_data):
    # Create crawl config
    sample_crawl_data["schedule"] = "0 0 * * *"
    sample_crawl_data["profileid"] = ""
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200

    data = r.json()
    global cid
    cid = data["id"]


def test_verify_default_browser_windows(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data.get("scale") is None
    assert data["browserWindows"] == 2


def test_add_crawl_config_single_page(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    # Create crawl config
    sample_crawl_data["config"]["limit"] = 1
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200

    data = r.json()
    global cid_single_page
    cid_single_page = data["id"]


def test_verify_default_browser_windows_single_page(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid_single_page}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data.get("scale") is None
    assert data["browserWindows"] == 1


def test_custom_browser_windows(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    sample_crawl_data["browserWindows"] = 4
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200
    workflow_id = r.json()["id"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{workflow_id}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data.get("scale") is None
    assert data["browserWindows"] == 4


def test_custom_scale(crawler_auth_headers, default_org_id, sample_crawl_data):
    # Legacy scale value is accepted but converted to browserWindows
    # (2 windows per scale unit, so scale 3 -> 6 windows)
    sample_crawl_data["scale"] = 3
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200
    workflow_id = r.json()["id"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{workflow_id}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data.get("scale") is None
    assert data["browserWindows"] == 6


def test_update_name_only(crawler_auth_headers, default_org_id):
    # update name only
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"name": "updated name 1"},
    )
    assert r.status_code == 200

    data = r.json()
    assert data["updated"]
    assert data["metadata_changed"] == True
    assert data["settings_changed"] == False


def test_update_description_only(crawler_auth_headers, default_org_id):
    # update description only
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"description": "updated description"},
    )
    assert r.status_code == 200

    data = r.json()
    assert data["updated"]
    assert data["metadata_changed"] == True
    assert data["settings_changed"] == False


def test_update_crawl_config_metadata(crawler_auth_headers, default_org_id):
    # Make a new collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={
            "crawlIds": [],
            "name": "autoAddUpdate",
        },
    )
    assert r.status_code == 200
    data = r.json()

    global _coll_id
    _coll_id = data["id"]
    assert _coll_id

    # Update crawl config
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={
            "name": UPDATED_NAME,
            "description": UPDATED_DESCRIPTION,
            "tags": UPDATED_TAGS,
            "autoAddCollections": [_coll_id],
        },
    )
    assert r.status_code == 200

    data = r.json()
    assert data["updated"]
    assert data["metadata_changed"] == True
    assert data["settings_changed"] == False


def test_verify_update(crawler_auth_headers, default_org_id):
    # Verify update was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data["name"] == UPDATED_NAME
    assert data["description"] == UPDATED_DESCRIPTION
    assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
    assert data["autoAddCollections"] == [_coll_id]
    assert data["firstSeed"] == "https://example.com/"


def test_update_config_invalid_format(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "seeds": ["https://example.com/"],
                "scopeType": "domain",
                "limit": 10,
            }
        },
    )

    assert r.status_code == 422


def test_update_config_invalid_exclude_regex(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"config": {"exclude": "["}},
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_regex"

    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"config": {"exclude": ["abc.*", "["]}},
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_regex"


def test_update_config_invalid_link_selector(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"config": {"selectLinks": []}},
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_link_selector"

    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"config": {"selectLinks": ["a[href]->href", "->href"]}},
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_link_selector"


def test_update_config_invalid_lang(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    for invalid_code in ("f", "fra", "french"):
        r = requests.patch(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
            headers=crawler_auth_headers,
            json={"config": {"lang": invalid_code}},
        )
        assert r.status_code == 400
        assert r.json()["detail"] == "invalid_lang"


def test_verify_default_select_links(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["config"]["selectLinks"] == ["a[href]->href"]


def test_verify_default_click_selector(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["config"]["clickSelector"] == "a"


def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "seeds": [{"url": "https://example.com/"}],
                "scopeType": "domain",
                "selectLinks": ["a[href]->href", "script[src]->src"],
                "clickSelector": "button",
            }
        },
    )
    assert r.status_code == 200

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()

    assert data["config"]["scopeType"] == "domain"
    assert data["config"]["selectLinks"] == ["a[href]->href", "script[src]->src"]
    assert data["config"]["clickSelector"] == "button"


def test_update_config_no_changes(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "seeds": [{"url": "https://example.com/"}],
                "scopeType": "domain",
                "selectLinks": ["a[href]->href", "script[src]->src"],
                "clickSelector": "button",
            }
        },
    )
    assert r.status_code == 200

    data = r.json()
    assert data["settings_changed"] == False
    assert data["metadata_changed"] == False


def test_update_crawl_timeout(crawler_auth_headers, default_org_id, sample_crawl_data):
    # Verify that updating crawl timeout works
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"crawlTimeout": 60},
    )
    assert r.status_code == 200
    data = r.json()

    assert data["settings_changed"] == True
    assert data["metadata_changed"] == False

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()

    assert data["crawlTimeout"] == 60


def test_update_max_crawl_size(crawler_auth_headers, default_org_id, sample_crawl_data):
    # Verify that updating max crawl size works
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"maxCrawlSize": 4096},
    )
    assert r.status_code == 200
    data = r.json()

    assert data["settings_changed"] == True
    assert data["metadata_changed"] == False

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()

    assert data["maxCrawlSize"] == 4096


def test_update_browser_windows(crawler_auth_headers, default_org_id):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"browserWindows": 1},
    )
    assert r.status_code == 200

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data.get("scale") is None
    assert data["browserWindows"] == 1


def test_update_scale(crawler_auth_headers, default_org_id):
    # Legacy scale value is converted to browserWindows (scale 1 -> 2 windows)
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"scale": 1},
    )
    assert r.status_code == 200

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data.get("scale") is None
    assert data["browserWindows"] == 2


def test_verify_delete_tags(crawler_auth_headers, default_org_id):
    # Verify that deleting tags and name works as well
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"tags": [], "name": None},
    )
    assert r.status_code == 200

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert not data["name"]
    assert data["tags"] == []


def test_verify_revs_history(crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/revs",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data["total"] == 5
    items = data["items"]
    assert len(items) == 5
    sorted_data = sorted(items, key=lambda revision: revision["rev"])
    assert sorted_data[0]["config"]["scopeType"] == "prefix"


def test_workflow_total_size_and_last_crawl_stats(
    crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] > 0
    items = data["items"]
    for workflow in items:
        assert workflow.get("config") is None
        assert workflow["seedCount"]
        assert workflow["firstSeed"]

        last_crawl_id = workflow.get("lastCrawlId")
        if last_crawl_id and last_crawl_id in (admin_crawl_id, crawler_crawl_id):
            assert workflow["totalSize"] > 0
            assert workflow["crawlCount"] > 0
            assert workflow["crawlSuccessfulCount"] > 0

            assert workflow["lastCrawlId"]
            assert workflow["lastCrawlStartTime"]
            assert workflow["lastStartedByName"]
            assert workflow["lastCrawlTime"]
            assert workflow["lastCrawlState"]
            assert workflow["lastRun"]
            assert workflow["lastCrawlSize"] > 0

            if last_crawl_id == admin_crawl_id:
                global _admin_crawl_cid
                _admin_crawl_cid = workflow["id"]
                assert _admin_crawl_cid
        else:
            assert workflow["totalSize"] == 0

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["totalSize"] > 0
    assert data["crawlCount"] > 0
    assert data["crawlSuccessfulCount"] > 0

    assert data["lastCrawlId"]
    assert data["lastCrawlStartTime"]
    assert data["lastStartedByName"]
    assert data["lastCrawlTime"]
    assert data["lastCrawlState"]
    assert data["lastRun"]
    assert data["lastCrawlSize"] > 0


def test_incremental_workflow_total_size_and_last_crawl_stats(
    crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
    # Get baseline values
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["crawlCount"] == 1
    assert data["crawlSuccessfulCount"] == 1
    total_size = data["totalSize"]
    last_crawl_id = data["lastCrawlId"]
    last_crawl_started = data["lastCrawlStartTime"]
    last_crawl_finished = data["lastCrawlTime"]
    last_run = data["lastRun"]

    # Run new crawl in this workflow
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/run",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    crawl_id = r.json()["started"]

    # Wait for it to complete
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=crawler_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            break
        time.sleep(5)

    # Give time for stats to re-compute
    time.sleep(10)

    # Re-check stats
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["crawlCount"] == 2
    assert data["crawlSuccessfulCount"] == 2
    assert data["totalSize"] > total_size
    assert data["lastCrawlId"] == crawl_id
    assert data["lastCrawlStartTime"] > last_crawl_started
    assert data["lastCrawlTime"] > last_crawl_finished
    assert data["lastRun"] > last_run

    # Delete new crawl
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=crawler_auth_headers,
        json={"crawl_ids": [crawl_id]},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"] == 1

    # Re-check stats
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["crawlCount"] == 1
    assert data["crawlSuccessfulCount"] == 1
    assert data["totalSize"] == total_size
    assert data["lastCrawlId"] == last_crawl_id
    assert data["lastCrawlStartTime"] == last_crawl_started
    assert data["lastCrawlTime"] == last_crawl_finished
    assert data["lastRun"] == last_run


def test_get_config_seeds(crawler_auth_headers, default_org_id, url_list_config_id):
    # Make sure seeds aren't included in the crawlconfig detail
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json().get("config").get("seeds") is None

    # Test getting seeds from separate endpoint
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}/seeds",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3

    EXPECTED_SEED_URLS = [
        "https://webrecorder.net/",
        "https://example.com/",
        "https://specs.webrecorder.net/",
    ]
    found_seed_urls = []

    for item in data["items"]:
        found_seed_urls.append(item["url"])

    assert sorted(found_seed_urls) == sorted(EXPECTED_SEED_URLS)

    # Test getting seeds with low page size
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}/seeds?pageSize=2",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    items = data["items"]
    assert len(items) == 2
    for item in items:
        assert item["url"] in EXPECTED_SEED_URLS

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{url_list_config_id}/seeds?pageSize=2&page=2",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] == 3
    items = data["items"]
    assert len(items) == 1
    assert items[0]["url"] in EXPECTED_SEED_URLS


def test_get_crawler_channels(crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/crawler-channels",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    crawler_channels = r.json()["channels"]
    assert crawler_channels
    assert len(crawler_channels) == 2
    for crawler_channel in crawler_channels:
        assert crawler_channel["id"]
        assert crawler_channel["image"]


def test_add_crawl_config_invalid_exclude_regex(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    sample_crawl_data["config"]["exclude"] = "["
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_regex"

    sample_crawl_data["config"]["exclude"] = ["abc.*", "["]
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_regex"


def test_add_crawl_config_invalid_lang(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    for invalid_code in ("f", "fra", "french"):
        sample_crawl_data["config"]["lang"] = invalid_code
        r = requests.post(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
            headers=crawler_auth_headers,
            json=sample_crawl_data,
        )
        assert r.status_code == 400
        assert r.json()["detail"] == "invalid_lang"


def test_add_crawl_config_invalid_link_selectors(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    sample_crawl_data["config"]["selectLinks"] = []
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_link_selector"

    sample_crawl_data["config"]["selectLinks"] = ["a[href]->href", "->href"]
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_link_selector"


def test_add_crawl_config_custom_behaviors_invalid_url(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    sample_crawl_data["config"]["customBehaviors"] = ["http"]
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_custom_behavior"


def test_add_crawl_config_custom_behaviors_valid_url(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js"
    sample_crawl_data["config"]["customBehaviors"] = [url]
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200
    data = r.json()
    config_id = data["id"]
    assert config_id

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["id"] == config_id
    assert data["config"]["customBehaviors"] == [url]


def test_add_update_crawl_config_custom_behaviors_git_url(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    git_url = "git+https://github.com/webrecorder/custom-behaviors"
    git_url_with_params = (
        "git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors"
    )

    # Create workflow and validate it looks like we expect
    sample_crawl_data["config"]["customBehaviors"] = [git_url]
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200
    data = r.json()
    config_id = data["id"]
    assert config_id

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["id"] == config_id
    assert data["config"]["customBehaviors"] == [git_url]

    # Try to update custom behaviors with invalid url, validate unchanged
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "customBehaviors": [git_url, "not-a-url"],
            }
        },
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_custom_behavior"

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["id"] == config_id
    assert data["config"]["customBehaviors"] == [git_url]

    # Update custom behaviors with valid url, validate changed
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "customBehaviors": [git_url_with_params],
            }
        },
    )
    assert r.status_code == 200

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["id"] == config_id
    assert data["config"]["customBehaviors"] == [git_url_with_params]


def test_validate_custom_behavior(crawler_auth_headers, default_org_id):
    valid_url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js"
    invalid_url_404 = "https://webrecorder.net/nonexistent/behavior.js"
    doesnt_resolve_url = "https://nonexistenturl-for-testing-browsertrix.com"
    malformed_url = "http"

    git_url = "git+https://github.com/webrecorder/custom-behaviors"
    invalid_git_url = "git+https://github.com/webrecorder/doesntexist"
    private_git_url = "git+https://github.com/webrecorder/website"

    git_url_with_params = (
        "git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors"
    )
    git_url_invalid_branch = (
        "git+https://github.com/webrecorder/custom-behaviors?branch=doesntexist"
    )

    # Success
    for url in (valid_url, git_url, git_url_with_params):
        r = requests.post(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
            headers=crawler_auth_headers,
            json={"customBehavior": url},
        )
        assert r.status_code == 200
        assert r.json()["success"]

    # Behavior 404s
    for url in (invalid_url_404, doesnt_resolve_url):
        r = requests.post(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
            headers=crawler_auth_headers,
            json={"customBehavior": url},
        )
        assert r.status_code == 404
        assert r.json()["detail"] == "custom_behavior_not_found"

    # Malformed url
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
        headers=crawler_auth_headers,
        json={"customBehavior": malformed_url},
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_custom_behavior"

    # Git repo doesn't exist
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
        headers=crawler_auth_headers,
        json={"customBehavior": invalid_git_url},
    )
    assert r.status_code == 404
    assert r.json()["detail"] == "custom_behavior_not_found"

    # Git repo isn't public
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
        headers=crawler_auth_headers,
        json={"customBehavior": private_git_url},
    )
    assert r.status_code == 404
    assert r.json()["detail"] == "custom_behavior_not_found"

    # Git branch doesn't exist
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
        headers=crawler_auth_headers,
        json={"customBehavior": git_url_invalid_branch},
    )
    assert r.status_code == 404
    assert r.json()["detail"] == "custom_behavior_branch_not_found"


def test_add_crawl_config_with_seed_file(
    crawler_auth_headers, default_org_id, seed_file_id, seed_file_config_id
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{seed_file_config_id}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data["id"] == seed_file_config_id
    assert data["name"] == "Seed File Test Crawl"
    assert data["config"]["seedFileId"] == seed_file_id
    assert data["config"]["seeds"] is None


def test_delete_in_use_seed_file(
    crawler_auth_headers, default_org_id, seed_file_id, seed_file_config_id
):
    # Attempt to delete in-use seed file, verify we get 400 response
    r = requests.delete(
        f"{API_PREFIX}/orgs/{default_org_id}/files/{seed_file_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "seed_file_in_use"

    # Verify the file still exists
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/files/{seed_file_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["id"] == seed_file_id