- Ability for a pod to be Completed, unlike in a StatefulSet: e.g. if 3 pods are running and the first one finishes, with a StatefulSet all 3 must keep running until all 3 are done. With this setup, the first finished pod can remain in the Completed state.
- Fixed shutdown order: crawler pods now correctly shut down before redis pods, by switching to background deletion.
- Pod priority decreases with scale: the 1st instance of a new crawl can preempt the 2nd or 3rd instance of another crawl.
- Create priority classes up to 'max_crawl_scale', configured in values.yaml.
- Improved scale change reconciliation: if increasing scale, immediately scale up. If decreasing scale, gracefully stop the scaled-down instances via the redis 'stopone' key and wait until they exit in the Completed state before adjusting status.scale / removing the scaled-down pods. Ensures unaccepted interrupts don't cause scaled-down data to be deleted. (A sketch of this flow follows the list below.)
- Redis pod remains inactive until the crawler is first active, or after no crawl pods have been active for 60 seconds.
- Configurable Redis storage with the 'redis_storage' value, set to 3Gi by default.
- CrawlJob deletion starts as soon as post-finish crawl operations are run.
- Post-crawl operations get their own redis instance, since the one used during the response is being cleaned up in the finalizer.
- Finalizer ignores requests with an incorrect state (returns 400 if reported as not finished while the crawl is finished).
- Current resource usage added to status.
- Profile browser: also manage the single pod directly, without a StatefulSet, for consistency.
- Restart pods via the restartTime value: if spec.restartTime != status.restartTime, clear out pods and update status.restartTime (using the OnDelete policy to avoid recreate loops in edge cases).
- Update to latest metacontroller (v4.11.0).
- Add --restartOnError flag for the crawler (for browsertrix-crawler 0.11.0).
- Failed crawl logging: add 'fail_crawl()' to be used for failing a crawl, which prints logs for the default container (if enabled) as well as pod status.
- tests: check other finished states to avoid getting stuck in an infinite loop if a crawl fails.
- tests: disable the disk utilization check, which adds unpredictability to crawl testing! fixes #1147

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
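A minimal sketch of the scale-down flow described in the scale reconciliation bullet above. The `CrawlStatus` class, `reconcile_scale` helper, redis key layout, and pod naming are illustrative assumptions for this sketch, not the actual Browsertrix operator code:

```python
from dataclasses import dataclass


@dataclass
class CrawlStatus:
    # illustrative stand-in for the operator's crawl status object
    crawl_id: str
    scale: int


async def reconcile_scale(desired_scale: int, status: CrawlStatus, redis, pods: dict):
    """Scale up immediately; only commit a scale-down once pods have exited."""
    if desired_scale >= status.scale:
        # Scaling up: report the new scale right away so the extra
        # crawler pods are created on the next sync.
        status.scale = desired_scale
        return

    # Scaling down: ask each scaled-down crawler instance to stop gracefully
    # via a per-instance "stopone"-style redis key, then wait until its pod
    # reaches Completed (phase "Succeeded") before lowering status.scale.
    all_completed = True
    for i in range(desired_scale, status.scale):
        pod_name = f"crawl-{status.crawl_id}-{i}"  # illustrative pod naming
        await redis.set(f"{pod_name}:stopone", "1")
        pod = pods.get(pod_name)
        if pod and pod["status"].get("phase") != "Succeeded":
            all_completed = False  # still draining; re-check on the next sync

    if all_completed:
        status.scale = desired_scale
```

The key point is that status.scale is only lowered once every scaled-down crawler pod has exited in the Completed state, so an interrupted scale-down never discards that instance's data.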
import requests
import time
import os
import pytest

from .conftest import API_PREFIX

crawl_id = None


def get_crawl(org_id, auth_headers, crawl_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
        headers=auth_headers,
    )
    assert r.status_code == 200
    return r.json()


def test_start_crawl_to_cancel(
    default_org_id, crawler_config_id_only, crawler_auth_headers
):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}/run",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data.get("started")

    global crawl_id
    crawl_id = data["started"]


def test_cancel_crawl(default_org_id, crawler_auth_headers):
    # wait for the crawl to move past the "starting" state
    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
    while data["state"] == "starting":
        time.sleep(5)
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/cancel",
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert data["success"] == True

    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

    # wait for the crawl to reach a finished state
    while data["state"] in (
        "starting",
        "running",
        "waiting_capacity",
        "generate-wacz",
        "uploading-wacz",
        "pending-wait",
    ):
        time.sleep(5)
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

    assert data["state"] == "canceled"
    assert data["stopping"] == False

    assert len(data["resources"]) == 0


def test_start_crawl_and_stop_immediately(
    default_org_id, crawler_config_id_only, crawler_auth_headers
):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}/run",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    crawl_id = data["started"]

    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/stop",
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert data["success"] == True

    # test crawl
    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
    assert data["stopping"] == True

    # test workflow
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}",
        headers=crawler_auth_headers,
    )
    assert r.json()["lastCrawlStopping"] == True

    while data["state"] in (
        "starting",
        "running",
        "waiting_capacity",
        "generate-wacz",
        "uploading-wacz",
        "pending-wait",
    ):
        time.sleep(5)
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

    assert data["state"] in ("canceled", "partial_complete")
    assert data["stopping"] == True


def test_start_crawl_to_stop_partial(
    default_org_id, crawler_config_id_only, crawler_auth_headers
):
    # wait for the previous crawl to finish before starting a new one
    while True:
        time.sleep(2)
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}",
            headers=crawler_auth_headers,
        )
        if r.json().get("isCrawlRunning") is False:
            break

    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}/run",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data.get("started")

    global crawl_id
    crawl_id = data["started"]


def test_stop_crawl_partial(
    default_org_id, crawler_config_id_only, crawler_auth_headers
):
    # wait until at least one page has been crawled before stopping
    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
    done = False
    while not done:
        time.sleep(2)
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
        done = data.get("stats") and data.get("stats").get("done") > 0

    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/stop",
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert data["success"] == True

    # test crawl
    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
    assert data["stopping"] == True

    # test workflow
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}",
        headers=crawler_auth_headers,
    )
    assert r.json()["lastCrawlStopping"] == True

    while data["state"] in (
        "running",
        "generate-wacz",
        "uploading-wacz",
        "pending-wait",
    ):
        time.sleep(5)
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

    assert data["state"] in ("partial_complete", "complete")
    assert data["stopping"] == True

    assert len(data["resources"]) == 1