* Btrixjobs Operator - Phase 1 (#679)
  - add metacontroller and custom crds
  - add main_op entrypoint for operator

* Btrix Operator Crawl Management (#767)

* operator backend:
  - run operator api in separate container but in same pod, with WEB_CONCURRENCY=1
  - operator creates statefulsets and services for CrawlJob and ProfileJob
  - operator: use service hook endpoint, set port in values.yaml

* crawls working with CrawlJob:
  - jobs start with 'crawljob-' prefix
  - update status to reflect current crawl state
  - set sync time to 10 seconds by default, overridable with 'operator_resync_seconds'
  - mark crawl as running, failed, or complete when finished
  - store finished status when crawl is complete
  - support updating scale, forcing rollover, and stopping via patching CrawlJob (see the sketch after this changelog)
  - support cancel via deletion
  - requires hack to content-length for patching custom resources
  - auto-delete of CrawlJob via 'ttlSecondsAfterFinished'
  - also delete pvcs until autodelete supported via statefulset (k8s >1.27)
  - ensure filesAdded always set correctly, keep counter in redis, add to status display
  - optimization: attempt to reduce automerging by reusing volumeClaimTemplates from existing children, as these may have additional props added
  - add add_crawl_errors_to_db() for storing crawl errors from redis '<crawl>:e' key to mongodb when crawl is finished/failed/canceled
  - add .status.size to display human-readable crawl size, if available (from webrecorder/browsertrix-crawler#291)
  - support new page size key (>0.9.0) and old page size key (changed in webrecorder/browsertrix-crawler#284)

* support for scheduled jobs!
  - add main_scheduled_job entrypoint to run scheduled jobs
  - add crawl_cron_job.yaml template for declaring CronJob
  - CronJobs moved to default namespace

* operator manages ProfileJobs:
  - jobs start with 'profilejob-' prefix
  - update expiry time by updating ProfileJob object 'expireTime' while profile is active

* refactor/cleanup:
  - remove k8s package
  - merge k8sman and basecrawlmanager into crawlmanager
  - move templates, k8sapi, utils into root package
  - delete all *_job.py files
  - remove dt_now, ts_now from crawls, now in utils
  - all db operations happen in crawl/crawlconfig/org files
  - move shared crawl/crawlconfig/org functions that use the db to be importable directly, including get_crawl_config, add_new_crawl, inc_crawl_stats

* role binding: more secure setup, don't allow crawler namespace any k8s permissions
  - move cronjobs to be created in default namespace
  - grant default namespace access to create cronjobs in default namespace
  - remove role binding from crawler namespace

* additional tweaks to templates:
  - split crawler and redis statefulsets into separate yaml files (in case one needs to be loaded separately)

* stats / redis optimization:
  - don't update stats in mongodb on every operator sync, only when crawl is finished
  - for api access, read stats directly from redis to get up-to-date stats
  - move get_page_stats() to utils, add get_redis_url() to k8sapi to unify access

* Add migration for operator changes
  - update configmap for crawl configs with scale > 1 or crawlTimeout > 0 and, where a schedule exists, recreate CronJobs
  - add option to rerun last migration, enabled via env var and by running helm with --set=rerun_last_migration=1

* subcharts:
  - move crawljob and profilejob crds to separate subchart, as this seems the best way to guarantee proper install order and update on upgrade with helm; add built btrix-crds-0.1.0.tgz subchart
  - metacontroller: use release from ghcr, add metacontroller-helm-v4.10.1.tgz subchart

* backend api fixes
  - ensure changing scale of crawl also updates it in the db
  - crawlconfigs: add 'currCrawlSize' and 'lastCrawlSize' to crawlconfig api

---------

Co-authored-by: D. Lee <leepro@gmail.com>
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
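The changelog above has the backend driving crawls by patching or deleting the CrawlJob custom resource rather than managing pods directly: scale changes, rollover, and graceful stop go through a patch, while cancel deletes the object. Below is a minimal, hypothetical sketch of that flow using the official `kubernetes` Python client; the CRD group, version, plural, namespace, and spec field names are illustrative assumptions, not the exact definitions from the btrix-crds subchart.

```python
# Hypothetical sketch only: CRD group/version/plural, namespace, and spec
# fields are illustrative assumptions, not the actual Browsertrix CRD schema.
from typing import Optional

from kubernetes import client, config


def _load_config():
    try:
        # In-cluster config when running inside the backend pod
        config.load_incluster_config()
    except config.ConfigException:
        # Fall back to local kubeconfig for development
        config.load_kube_config()


def patch_crawljob(crawl_id: str, scale: Optional[int] = None, stopping: bool = False):
    """Patch the CrawlJob custom resource to change scale or request a stop."""
    _load_config()

    spec = {}
    if scale is not None:
        spec["scale"] = scale  # assumed field controlling number of crawler instances
    if stopping:
        spec["stopping"] = True  # assumed field requesting a graceful stop

    client.CustomObjectsApi().patch_namespaced_custom_object(
        group="btrix.cloud",  # assumed CRD group
        version="v1",  # assumed CRD version
        namespace="crawlers",  # assumed namespace where CrawlJobs live
        plural="crawljobs",
        name=f"crawljob-{crawl_id}",  # jobs start with the 'crawljob-' prefix
        body={"spec": spec},
    )


def cancel_crawljob(crawl_id: str):
    """Cancel a crawl by deleting its CrawlJob, per the changelog above."""
    _load_config()
    client.CustomObjectsApi().delete_namespaced_custom_object(
        group="btrix.cloud",
        version="v1",
        namespace="crawlers",
        plural="crawljobs",
        name=f"crawljob-{crawl_id}",
    )
```

The real backend uses its own Kubernetes API wrapper and the content-length workaround noted in the changelog when patching custom resources; this sketch only shows the shape of the patch and delete operations. The API tests below exercise the crawl config endpoints that sit on top of this machinery.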
import requests

from .conftest import API_PREFIX

cid = None

UPDATED_NAME = "Updated name"
UPDATED_DESCRIPTION = "Updated description"
UPDATED_TAGS = ["tag3", "tag4"]


def test_add_crawl_config(crawler_auth_headers, default_org_id, sample_crawl_data):
    # Create crawl config
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200

    data = r.json()
    global cid
    cid = data["added"]


def test_update_name_only(crawler_auth_headers, default_org_id):
    # Update name only
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"name": "updated name 1"},
    )
    assert r.status_code == 200

    data = r.json()
    assert data["success"]
    assert data["metadata_changed"] == True
    assert data["settings_changed"] == False


def test_update_description_only(crawler_auth_headers, default_org_id):
    # Update description only
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"description": "updated description"},
    )
    assert r.status_code == 200

    data = r.json()
    assert data["success"]
    assert data["metadata_changed"] == True
    assert data["settings_changed"] == False


def test_update_crawl_config_metadata(crawler_auth_headers, default_org_id):
    # Update crawl config
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={
            "name": UPDATED_NAME,
            "description": UPDATED_DESCRIPTION,
            "tags": UPDATED_TAGS,
        },
    )
    assert r.status_code == 200

    data = r.json()
    assert data["success"]
    assert data["metadata_changed"] == True
    assert data["settings_changed"] == False


def test_verify_update(crawler_auth_headers, default_org_id):
    # Verify update was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data["name"] == UPDATED_NAME
    assert data["description"] == UPDATED_DESCRIPTION
    assert sorted(data["tags"]) == sorted(UPDATED_TAGS)


def test_update_config_invalid_format(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    # Seeds given as plain strings are an invalid format and should be rejected
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "seeds": ["https://example.com/"],
                "scopeType": "domain",
                "limit": 10,
            }
        },
    )

    assert r.status_code == 422


def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
    # Update crawl config settings and verify the change is returned
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "seeds": [{"url": "https://example.com/"}],
                "scopeType": "domain",
            }
        },
    )
    assert r.status_code == 200

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()

    assert data["config"]["scopeType"] == "domain"


def test_update_config_no_changes(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    # Re-submitting the same settings should report no changes
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={
            "config": {
                "seeds": [{"url": "https://example.com/"}],
                "scopeType": "domain",
            }
        },
    )
    assert r.status_code == 200

    data = r.json()
    assert data["settings_changed"] == False
    assert data["metadata_changed"] == False


def test_update_crawl_timeout(crawler_auth_headers, default_org_id, sample_crawl_data):
    # Verify that updating crawl timeout works
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"crawlTimeout": 60},
    )
    assert r.status_code == 200
    data = r.json()

    assert data["settings_changed"] == True
    assert data["metadata_changed"] == False

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()

    assert data["crawlTimeout"] == 60


def test_verify_delete_tags(crawler_auth_headers, default_org_id):
    # Verify that deleting tags and name works as well
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"tags": [], "name": None},
    )
    assert r.status_code == 200

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert not data["name"]
    assert data["tags"] == []


def test_verify_revs_history(crawler_auth_headers, default_org_id):
    # Settings were changed twice above, so two config revisions should exist,
    # with the oldest revision still holding the original 'prefix' scope
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/revs",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data["total"] == 2
    items = data["items"]
    assert len(items) == 2
    sorted_data = sorted(items, key=lambda revision: revision["rev"])
    assert sorted_data[0]["config"]["scopeType"] == "prefix"


def test_workflow_total_size(
    crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
    # Workflows with finished crawls should report a non-zero totalSize,
    # both in the list endpoint and in the single-workflow endpoint
    admin_crawl_cid = ""

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] > 0
    items = data["items"]
    for workflow in items:
        last_crawl_id = workflow.get("lastCrawlId")
        if last_crawl_id and last_crawl_id in (admin_crawl_id, crawler_crawl_id):
            assert workflow["totalSize"] > 0
            if last_crawl_id == admin_crawl_id:
                admin_crawl_cid = workflow["id"]
        else:
            assert workflow["totalSize"] == 0

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{admin_crawl_cid}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["totalSize"] > 0