Backend:
- add 'maxCrawlSize' to models and crawljob spec
- add 'MAX_CRAWL_SIZE' to configmap
- add maxCrawlSize to new crawlconfig + update APIs
- operator: gracefully stop crawl if current size (from stats) exceeds maxCrawlSize
- tests: add max crawl size tests

Frontend:
- Add Max Crawl Size text box to Limits tab
- Users enter max crawl size in GB, converted to bytes
- Add BYTES_PER_GB as constant for converting to bytes
- docs: add Crawl Size Limit to user guide workflow setup section

Operator Refactor:
- use 'status.stopping' instead of 'crawl.stopping' to indicate crawl is being stopped, as changing the latter has no effect in the operator
- add is_crawl_stopping() to return whether crawl is being stopped, based on crawl.stopping or the size or time limit being reached
- crawlerjob status: store raw byte size under 'size', human-readable size under 'sizeHuman' for clarity
- size stat always exists (defaults to 0), so remove unneeded conditional

Charts:
- subchart: update crawlerjob crd in btrix-crds to show status.stopping instead of spec.stopping
- subchart: show 'sizeHuman' property instead of 'size'
- bump subchart version to 0.1.1

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
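
The operator change above boils down to one predicate. A minimal sketch of the idea, assuming hypothetical crawl fields (stopping, timeout, max_crawl_size) and a size value read from crawl stats; the real operator's attribute names and plumbing may differ:

# Illustrative sketch only: attribute names are assumptions, not the operator API.
def is_crawl_stopping(crawl, size, started, now):
    """Return True if the crawl is being stopped, per the rules in this change."""
    # an explicit graceful-stop request
    if crawl.stopping:
        return True
    # crawl timeout (seconds) reached
    if crawl.timeout and (now - started).total_seconds() > crawl.timeout:
        return True
    # max crawl size (bytes) exceeded, based on current size stats
    if crawl.max_crawl_size and size > crawl.max_crawl_size:
        return True
    return False
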
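For the frontend conversion, users type the limit in GB and the UI multiplies by BYTES_PER_GB before sending maxCrawlSize. A Python rendering of that conversion (the real constant lives in the TypeScript frontend, and whether a GB here is 10**9 or 2**30 bytes is an assumption, not confirmed by this file):

# Sketch: the actual BYTES_PER_GB value is defined in the frontend code.
BYTES_PER_GB = 10**9

def gb_to_bytes(gb):
    """Convert a user-entered GB limit to the byte value stored in maxCrawlSize."""
    return int(gb * BYTES_PER_GB)
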
import datetime
import time

import pytest
import requests


HOST_PREFIX = "http://127.0.0.1:30870"
API_PREFIX = HOST_PREFIX + "/api"

ADMIN_USERNAME = "admin@example.com"
ADMIN_PW = "PASSW0RD!"


@pytest.fixture(scope="session")
def admin_auth_headers():
    # Retry login until the backend is up and the admin user exists.
    while True:
        r = requests.post(
            f"{API_PREFIX}/auth/jwt/login",
            data={
                "username": ADMIN_USERNAME,
                "password": ADMIN_PW,
                "grant_type": "password",
            },
        )
        data = r.json()
        try:
            return {"Authorization": f"Bearer {data['access_token']}"}
        except KeyError:
            print("Waiting for admin_auth_headers")
            time.sleep(5)


@pytest.fixture(scope="session")
def default_org_id(admin_auth_headers):
    # Retry until the default org has been created.
    while True:
        r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
        data = r.json()
        try:
            for org in data["items"]:
                if org["default"] is True:
                    return org["id"]
        except KeyError:
            print("Waiting for default org id")
            time.sleep(5)


@pytest.fixture(scope="session")
def crawl_id_wr(admin_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Webrecorder admin test crawl",
        "tags": ["wr", "nightly testing"],
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_id = data["run_now_job"]

    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def crawl_id_wr_specs(admin_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Webrecorder Specs admin test crawl",
        "tags": ["wr-specs", "nightly testing"],
        "config": {
            "seeds": [{"url": "https://specs.webrecorder.net/"}],
            "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_id = data["run_now_job"]

    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def crawl_config_info(admin_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Crawl config test",
        "config": {"seeds": [{"url": "https://specs.webrecorder.net/"}], "limit": 1},
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_config_id = data["id"]
    crawl_id = data["run_now_job"]

    # Wait for first crawl to complete.
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            break
        time.sleep(5)

    # Run second crawl from crawlconfig and return info when it finishes
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}/run",
        headers=admin_auth_headers,
    )
    data = r.json()
    second_crawl_id = data["started"]
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{second_crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return (crawl_config_id, crawl_id, second_crawl_id)
        time.sleep(5)


@pytest.fixture(scope="session")
def large_crawl_id(admin_auth_headers, default_org_id):
    # Start crawl
    crawl_data = {
        "runNow": True,
        "name": "Large Test Crawl",
        "tags": ["wacz-logs"],
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "scopeType": "domain",
            "limit": 100,
            "extraHops": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_id = data["run_now_job"]

    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "running":
            # Give crawl time to start properly
            time.sleep(30)
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def large_crawl_finished(admin_auth_headers, default_org_id, large_crawl_id):
    # Wait for crawl to complete
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{large_crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            # Give some time for WACZ files to be stored
            time.sleep(30)
            break
        time.sleep(5)


@pytest.fixture(scope="session")
def timeout_crawl(admin_auth_headers, default_org_id):
    # Start crawl
    crawl_data = {
        "runNow": True,
        "name": "Crawl with crawl timeout",
        "crawlTimeout": 30,
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "scopeType": "domain",
            "limit": 100,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()
    return data["run_now_job"]


@pytest.fixture(scope="session")
def max_crawl_size_crawl_id(admin_auth_headers, default_org_id):
    # Start crawl
    crawl_data = {
        "runNow": True,
        "name": "Crawl with 5 MB max crawl size limit",
        # Note: the crawl will somewhat exceed this size, as it only begins to
        # gracefully shut down once the operator notices the limit is exceeded.
        "maxCrawlSize": 5242880,
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "scopeType": "domain",
            "limit": 100,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()
    return data["run_now_job"]


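# Sketch of a nightly test consuming the fixture above; illustrative only.
# It assumes a crawl stopped for exceeding maxCrawlSize ends in the
# "partial_complete" state rather than "failed"; the exact end-state name
# is an assumption, not taken from this file.
def test_max_crawl_size_sketch(admin_auth_headers, default_org_id, max_crawl_size_crawl_id):
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{max_crawl_size_crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] in ("complete", "partial_complete", "canceled", "failed"):
            break
        time.sleep(5)

    # assumed end state for a graceful size-limit stop
    assert data["state"] == "partial_complete"

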
@pytest.fixture(scope="session")
def error_crawl_id(admin_auth_headers, default_org_id):
    crawl_data = {
        "runNow": True,
        "name": "Invalid URL crawl",
        "config": {
            "seeds": [
                {"url": "https://invalid-x.webrecorder.net/"},
            ],
            "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_id = data["run_now_job"]

    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def org_with_quotas(admin_auth_headers):
    name = "Quota Org " + datetime.datetime.utcnow().isoformat()
    r = requests.post(
        f"{API_PREFIX}/orgs/create", headers=admin_auth_headers, json={"name": name}
    )
    data = r.json()

    return data["id"]
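

# Hedged usage sketch for the fixture above; the /quotas endpoint and the
# "maxConcurrentCrawls" field are assumptions about the org quota API, not
# taken from this file.
def _set_quotas_sketch(admin_auth_headers, org_with_quotas):
    r = requests.post(
        f"{API_PREFIX}/orgs/{org_with_quotas}/quotas",
        headers=admin_auth_headers,
        json={"maxConcurrentCrawls": 1},
    )
    assert r.status_code == 200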