* backend: make crawlconfigs mutable! (#656)

- crawlconfig PATCH /{id} can now receive a new JSON config to replace the old one (in addition to scale, schedule, tags)
- exclusions: add/remove APIs mutate the current crawlconfig; they no longer result in a new crawlconfig being created
- exclusions: ensure crawl job 'config' is updated when exclusions are added/removed; unify add/remove exclusions on crawl
- k8s: crawlconfig json is updated along with scale
- k8s: stateful set is restarted by updating an annotation instead of changing the template
- crawl object: now has 'config', 'profileid', 'schedule', 'crawlTimeout', and 'jobType' properties, so that anything changeable is stored on the crawl
- crawlconfigcore: store shared properties between crawl and crawlconfig in new crawlconfigcore ('schedule', 'jobType', 'config', 'profileid', 'crawlTimeout', 'tags', 'oid'); see the model sketch after this message
- crawlconfig object: remove 'oldId' and 'newId'; disallow deactivating/deleting while a crawl is running
- rename 'userid' -> 'createdBy'
- remove unused 'completions' field
- add missing return to fix /run response
- crawlout: ensure 'profileName' is resolved on CrawlOut from profileid
- crawlout: return 'name' instead of 'configName' for a consistent response
- update: 'modified' and 'modifiedBy' fields record the modification date and the user who modified the config
- update: ensure PROFILE_FILENAME is updated in the configmap if profileid is provided; clear it if profileid == ""
- update: return 'settings_changed' and 'metadata_changed' indicating whether crawl settings or metadata changed
- tests: update tests to check the settings_changed/metadata_changed return values

add revision tracking to crawlconfig (a storage sketch follows the tests below):

- store each revision in a separate mongo db collection
- revisions accessible via /crawlconfigs/{cid}/revs
- store 'rev' int in crawlconfig and in crawljob
- only add a revision history entry if the crawl config changed

migration:

- update to db v3
- copy fields from crawlconfig -> crawl
- rename userid -> createdBy
- copy userid -> modifiedBy, created -> modified
- skip invalid crawls (missing config); make createdBy optional (just in case)

frontend: Update crawl config keys with new API (#681); update frontend to use the new PATCH endpoint; load config from the crawl object in the details view

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Co-authored-by: sua yoo <sua@webrecorder.org>
Co-authored-by: sua yoo <sua@suayoo.com>
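To make the new shared-fields arrangement concrete, here is a minimal sketch assuming Pydantic-style models; the field names come from the commit message above, but the exact class names, types, and defaults in the backend may differ.

from typing import Any, Dict, List, Optional

from pydantic import BaseModel


class CrawlConfigCore(BaseModel):
    # Hypothetical sketch: properties shared between crawl and crawlconfig,
    # so that everything changeable is also snapshotted on the crawl itself
    schedule: Optional[str] = ""
    jobType: Optional[str] = None
    config: Dict[str, Any] = {}
    profileid: Optional[str] = None
    crawlTimeout: Optional[int] = 0
    tags: List[str] = []
    oid: Optional[str] = None


class CrawlConfig(CrawlConfigCore):
    # Hypothetical sketch: the mutable config adds metadata fields;
    # 'createdBy' replaces the old 'userid', and 'rev' counts revisions
    name: Optional[str] = None
    createdBy: Optional[str] = None
    modifiedBy: Optional[str] = None
    rev: int = 0
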
import requests

from .conftest import API_PREFIX

cid = None

UPDATED_NAME = "Updated name"
UPDATED_TAGS = ["tag3", "tag4"]

def test_add_crawl_config(crawler_auth_headers, default_org_id, sample_crawl_data):
    # Create crawl config
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200

    data = r.json()
    global cid
    cid = data["added"]

def test_update_name_only(crawler_auth_headers, default_org_id):
    # Update name only
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"name": "updated name 1"},
    )
    assert r.status_code == 200

    data = r.json()
    assert data["success"]
    assert data["metadata_changed"] == True
    assert data["settings_changed"] == False

def test_update_crawl_config_name_and_tags(crawler_auth_headers, default_org_id):
    # Update crawl config name and tags
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"name": UPDATED_NAME, "tags": UPDATED_TAGS},
    )
    assert r.status_code == 200

    data = r.json()
    assert data["success"]
    assert data["metadata_changed"] == True
    assert data["settings_changed"] == False

def test_verify_update(crawler_auth_headers, default_org_id):
    # Verify update was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data["name"] == UPDATED_NAME
    assert sorted(data["tags"]) == sorted(UPDATED_TAGS)

def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
    # Replace the crawl config itself; this counts as a settings change
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"config": {"seeds": ["https://example.com/"], "scopeType": "domain"}},
    )
    assert r.status_code == 200

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()

    assert data["config"]["scopeType"] == "domain"

def test_update_config_no_changes(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    # Re-submitting an identical config should report no changes
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"config": {"seeds": ["https://example.com/"], "scopeType": "domain"}},
    )
    assert r.status_code == 200

    data = r.json()
    assert data["settings_changed"] == False
    assert data["metadata_changed"] == False

def test_update_crawl_timeout(crawler_auth_headers, default_org_id, sample_crawl_data):
    # Verify that updating crawl timeout works
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"crawlTimeout": 60},
    )
    assert r.status_code == 200
    data = r.json()

    assert data["settings_changed"] == True
    assert data["metadata_changed"] == False

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()

    assert data["crawlTimeout"] == 60

def test_verify_delete_tags(crawler_auth_headers, default_org_id):
    # Verify that deleting tags and name works as well
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"tags": [], "name": None},
    )
    assert r.status_code == 200

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert not data["name"]
    assert data["tags"] == []

def test_verify_revs_history(crawler_auth_headers, default_org_id):
    # Two settings changes were made above (config replacement and
    # crawlTimeout update), so two revisions should have been recorded;
    # the oldest revision preserves the pre-update "prefix" scopeType
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/revs",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert len(data) == 2
    sorted_data = sorted(data, key=lambda revision: revision["rev"])
    assert sorted_data[0]["config"]["scopeType"] == "prefix"
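
For context on what test_verify_revs_history exercises, here is a minimal sketch of the revision-tracking write path described in the commit message, using pymongo. The 'configs_revs' collection name, the database name, and the update_crawl_config helper are all hypothetical; only the behavior (snapshot the old config into a separate collection, bump the 'rev' integer, skip the revision entirely when settings did not change) comes from the commit message.

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # assumed local mongo instance
db = client["browsertrix"]  # hypothetical database name


def update_crawl_config(cid, update):
    # Hypothetical helper: apply `update` to a crawlconfig, recording a
    # revision only when crawl settings actually changed
    current = db["crawl_configs"].find_one({"_id": cid})
    if current is None:
        raise ValueError("no such crawlconfig")

    settings_changed = (
        "config" in update and update["config"] != current.get("config")
    )

    if settings_changed:
        # Snapshot the outgoing config into a separate revisions collection,
        # keyed by the config id and an incrementing 'rev' integer
        db["configs_revs"].insert_one(
            {"cid": cid, "rev": current.get("rev", 0), "config": current.get("config")}
        )
        update["rev"] = current.get("rev", 0) + 1

    db["crawl_configs"].update_one({"_id": cid}, {"$set": update})
    return settings_changed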