- Ability for a pod to be Completed, unlike in a StatefulSet: e.g. if 3 pods are running and the first one finishes, all 3 must keep running until all 3 are done. With this setup, the first finished pod can remain in the Completed state.
- Fixed shutdown order: crawler pods now correctly shut down before redis pods, by switching to background deletion.
- Pod priority decreases with scale: the 1st instance of a new crawl can preempt the 2nd or 3rd instance of another crawl.
- Create priority classes up to 'max_crawl_scale', configured in values.yaml (see the sketch after this list).
- Improved scale change reconciliation: if increasing scale, scale up immediately. If decreasing scale, gracefully stop the scaled-down instances via the redis 'stopone' key, and wait until they exit in the Completed state before adjusting status.scale / removing scaled-down pods. This ensures unexpected interrupts don't cause scaled-down data to be deleted.
- Redis pod remains inactive until the crawler is first active, and again after no crawl pods have been active for 60 seconds.
- Configurable redis storage via the 'redis_storage' value, set to 3Gi by default.
- CrawlJob deletion starts as soon as post-finish crawl operations have run.
- Post-crawl operations get their own redis instance, since the one used during the crawl is being cleaned up in the finalizer.
- Finalizer ignores requests with incorrect state (returns 400 if the crawl is reported as not finished while it is actually finished).
- Current resource usage added to status.
- Profile browser: also manage its single pod directly without a StatefulSet, for consistency.
- Restart pods via the restartTime value: if spec.restartTime != status.restartTime, clear out pods and update status.restartTime, using the OnDelete policy to avoid recreate loops in edge cases (see the sketch after this list).
- Update to latest metacontroller (v4.11.0)
- Add --restartOnError flag for the crawler (for browsertrix-crawler 0.11.0)
- Failed crawl logging: add 'fail_crawl()' to be used for failing a crawl, which prints logs for the default container (if enabled) as well as pod status.
- tests: check other finished states to avoid getting stuck in an infinite loop if a crawl fails
- tests: disable disk utilization check, which adds unpredictability to crawl testing! fixes #1147

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
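For illustration, here is a minimal sketch of how per-instance priority classes might be generated up to `max_crawl_scale`, per the bullet above. The helper name `make_priority_classes` and the base priority value are assumptions for this sketch, not the chart's actual template:

```python
# Hypothetical sketch: one PriorityClass per crawler instance index, with
# priority decreasing as the index grows, so instance 0 of a new crawl can
# preempt instance 1 or 2 of another crawl.
CRAWLER_BASE_PRIORITY = -100  # assumed base value; the real chart may differ


def make_priority_classes(max_crawl_scale: int) -> list:
    return [
        {
            "apiVersion": "scheduling.k8s.io/v1",
            "kind": "PriorityClass",
            "metadata": {"name": f"crawl-instance-{i}"},
            "value": CRAWLER_BASE_PRIORITY - i,  # higher index, lower priority
            "globalDefault": False,
        }
        for i in range(max_crawl_scale)
    ]
```

And a hedged sketch of the restartTime comparison described above, assuming the operator sees the CrawlJob spec and status as plain dicts (the function name is hypothetical):

```python
def pods_to_restart(spec: dict, status: dict, pod_names: list) -> list:
    """If a restart was requested, return all pods to delete.

    With the OnDelete update policy, deleted pods are recreated with the
    new template, avoiding recreate loops in edge cases.
    """
    if spec.get("restartTime") != status.get("restartTime"):
        status["restartTime"] = spec.get("restartTime")
        return list(pod_names)
    return []
```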
import pytest
import requests
import time


HOST_PREFIX = "http://127.0.0.1:30870"
API_PREFIX = HOST_PREFIX + "/api"

ADMIN_USERNAME = "admin@example.com"
ADMIN_PW = "PASSW0RD!"

VIEWER_USERNAME = "viewer@example.com"
VIEWER_PW = "viewerPASSW0RD!"

CRAWLER_USERNAME = "crawler@example.com"
CRAWLER_PW = "crawlerPASSWORD!"

_admin_config_id = None
_crawler_config_id = None
_auto_add_config_id = None
_all_crawls_config_id = None

NON_DEFAULT_ORG_NAME = "Non-default org"

FINISHED_STATES = ("complete", "partial_complete", "canceled", "failed")


@pytest.fixture(scope="session")
def admin_auth_headers():
    while True:
        r = requests.post(
            f"{API_PREFIX}/auth/jwt/login",
            data={
                "username": ADMIN_USERNAME,
                "password": ADMIN_PW,
                "grant_type": "password",
            },
        )
        data = r.json()
        try:
            return {"Authorization": f"Bearer {data['access_token']}"}
        except:
            print("Waiting for admin_auth_headers")
            time.sleep(5)


@pytest.fixture(scope="session")
def default_org_id(admin_auth_headers):
    while True:
        r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
        data = r.json()
        try:
            for org in data["items"]:
                if org["default"] is True:
                    return org["id"]
        except:
            print("Waiting for default org id")
            time.sleep(5)


@pytest.fixture(scope="session")
def non_default_org_id(admin_auth_headers):
    r = requests.post(
        f"{API_PREFIX}/orgs/create",
        headers=admin_auth_headers,
        json={"name": NON_DEFAULT_ORG_NAME},
    )
    assert r.status_code == 200

    while True:
        r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
        data = r.json()
        try:
            for org in data["items"]:
                if org["name"] == NON_DEFAULT_ORG_NAME:
                    return org["id"]
        except:
            print("Waiting for non-default org id")
            time.sleep(5)


@pytest.fixture(scope="session")
def admin_crawl_id(admin_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Admin Test Crawl",
        "description": "Admin Test Crawl description",
        "tags": ["wr-test-1", "wr-test-2"],
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "exclude": "community",
            # limit now set via 'max_pages_per_crawl' global limit
            # "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    global _admin_config_id
    _admin_config_id = data["id"]

    crawl_id = data["run_now_job"]
    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] in FINISHED_STATES:
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def admin_config_id(admin_crawl_id):
    return _admin_config_id


@pytest.fixture(scope="session")
def admin_userid(admin_auth_headers):
    r = requests.get(f"{API_PREFIX}/users/me", headers=admin_auth_headers)
    return r.json()["id"]


@pytest.fixture(scope="session")
def viewer_auth_headers(admin_auth_headers, default_org_id):
    requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/add-user",
        json={
            "email": VIEWER_USERNAME,
            "password": VIEWER_PW,
            "name": "newviewer",
            "role": 10,
        },
        headers=admin_auth_headers,
    )
    r = requests.post(
        f"{API_PREFIX}/auth/jwt/login",
        data={
            "username": VIEWER_USERNAME,
            "password": VIEWER_PW,
            "grant_type": "password",
        },
    )
    data = r.json()
    access_token = data.get("access_token")
    return {"Authorization": f"Bearer {access_token}"}


@pytest.fixture(scope="session")
def crawler_auth_headers(admin_auth_headers, default_org_id):
    requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/add-user",
        json={
            "email": CRAWLER_USERNAME,
            "password": CRAWLER_PW,
            "name": "new-crawler",
            "description": "crawler test crawl",
            "role": 20,
        },
        headers=admin_auth_headers,
    )
    r = requests.post(
        f"{API_PREFIX}/auth/jwt/login",
        data={
            "username": CRAWLER_USERNAME,
            "password": CRAWLER_PW,
            "grant_type": "password",
        },
    )
    data = r.json()
    access_token = data.get("access_token")
    return {"Authorization": f"Bearer {access_token}"}


@pytest.fixture(scope="session")
def crawler_userid(crawler_auth_headers):
    r = requests.get(f"{API_PREFIX}/users/me", headers=crawler_auth_headers)
    return r.json()["id"]


@pytest.fixture(scope="session")
def _crawler_create_config_only(crawler_auth_headers, default_org_id):
    # Create crawl config without running it.
    crawl_data = {
        "runNow": False,
        "name": "Crawler User Test Crawl",
        "description": "crawler test crawl",
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "pageExtraDelay": 10,
            "limit": 3,
            "exclude": "community",
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    global _crawler_config_id
    _crawler_config_id = data["id"]


@pytest.fixture(scope="session")
def crawler_crawl_id(crawler_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Crawler User Test Crawl",
        "description": "crawler test crawl",
        "tags": ["wr-test-2"],
        "config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 1},
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    global _crawler_config_id
    _crawler_config_id = data["id"]

    crawl_id = data["run_now_job"]
    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=crawler_auth_headers,
        )
        data = r.json()
        if data["state"] in FINISHED_STATES:
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def wr_specs_crawl_id(crawler_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Webrecorder Specs sample crawl",
        "config": {"seeds": [{"url": "https://specs.webrecorder.net/"}], "limit": 1},
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_id = data["run_now_job"]
    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=crawler_auth_headers,
        )
        data = r.json()
        if data["state"] in FINISHED_STATES:
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def crawler_config_id(crawler_crawl_id):
    return _crawler_config_id


@pytest.fixture(scope="session")
def crawler_config_id_only(_crawler_create_config_only):
    return _crawler_config_id


@pytest.fixture(scope="session")
def sample_crawl_data():
    return {
        "runNow": False,
        "name": "Test Crawl",
        "config": {"seeds": [{"url": "https://example.com/"}]},
        "tags": ["tag1", "tag2"],
    }


@pytest.fixture(scope="session")
def auto_add_collection_id(crawler_auth_headers, default_org_id):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={"name": "Auto Add Collection"},
    )
    assert r.status_code == 200
    return r.json()["id"]


@pytest.fixture(scope="session")
def auto_add_crawl_id(crawler_auth_headers, default_org_id, auto_add_collection_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Auto Add",
        "description": "For testing auto-adding new workflow crawls to collections",
        "autoAddCollections": [auto_add_collection_id],
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    global _auto_add_config_id
    _auto_add_config_id = data["id"]

    crawl_id = data["run_now_job"]
    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=crawler_auth_headers,
        )
        data = r.json()
        if data["state"] in FINISHED_STATES:
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def auto_add_config_id(auto_add_crawl_id):
    return _auto_add_config_id


@pytest.fixture(scope="session")
def all_crawls_crawl_id(crawler_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "All Crawls Test Crawl",
        "description": "Lorem ipsum",
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "exclude": "community",
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    global _all_crawls_config_id
    _all_crawls_config_id = data["id"]

    crawl_id = data["run_now_job"]
    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=crawler_auth_headers,
        )
        data = r.json()
        if data["state"] in FINISHED_STATES:
            break
        time.sleep(5)

    # Add description to crawl
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}",
        headers=crawler_auth_headers,
        json={"description": "Lorem ipsum"},
    )
    assert r.status_code == 200
    return crawl_id


@pytest.fixture(scope="session")
def all_crawls_config_id(all_crawls_crawl_id):
    return _all_crawls_config_id


@pytest.fixture(scope="session")
def uploads_collection_id(crawler_auth_headers, default_org_id):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={"name": "Upload test collection"},
    )
    assert r.status_code == 200
    return r.json()["id"]