- Adds two new crawl finished states, stopped_by_user and stopped_quota_reached
- Tracks other possible 'stop reasons' in the operator, though not making them distinct states for now.
- Updates the frontend with 'Stopped by User' and 'Stopped: Time Quota Reached', shown with the same icon as the current partial_complete
- Adds a migration of partial_complete to either stopped_by_user or complete (no historical quota data available); see the sketch below
- Addresses an edge case in scaling: if a crawl never scaled (no redis entry, no pod), automatically scale down
- Addresses an edge case in status: if a crawl is somehow 'canceled' but not deleted, immediately delete the crawl object and begin finalizing.

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
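For illustration only, a minimal sketch of what the partial_complete migration could look like, assuming crawls live in a MongoDB collection named "crawls" and that a per-crawl "stopping" flag marks user-initiated stops. The collection name, the flag, and the selection criteria are assumptions for the sketch, not the actual migration shipped in this change:

```python
# Hypothetical migration sketch (not the actual migration in this PR).
# Assumes a Mongo "crawls" collection and a "stopping" flag set on
# user-initiated stops; since no historical quota data is available,
# all remaining partial_complete crawls fall back to "complete".
from pymongo import MongoClient

crawls = MongoClient("mongodb://localhost:27017")["browsertrix"]["crawls"]

# partial_complete crawls that were explicitly stopped become stopped_by_user
crawls.update_many(
    {"state": "partial_complete", "stopping": True},
    {"$set": {"state": "stopped_by_user"}},
)

# everything else that was partial_complete is treated as complete
crawls.update_many(
    {"state": "partial_complete"},
    {"$set": {"state": "complete"}},
)
```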
80 lines · 2.3 KiB · Python
import requests
import time

from .conftest import API_PREFIX
from .utils import verify_file_replicated


def test_crawl_timeout(admin_auth_headers, default_org_id, timeout_crawl):
    # Verify that crawl has started
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{timeout_crawl}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["state"] in ("starting", "running")

    # Wait some time to let crawl start, hit timeout, and gracefully stop
    time.sleep(60)

    # Verify crawl was stopped
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{timeout_crawl}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["state"] == "complete"


def test_crawl_files_replicated(admin_auth_headers, default_org_id, timeout_crawl):
    time.sleep(20)

    # Verify replication job was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/jobs?sortBy=started&sortDirection=1&jobType=create-replica",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    latest_job = r.json()["items"][0]
    assert latest_job["type"] == "create-replica"
    job_id = latest_job["id"]

    attempts = 0
    while attempts < 5:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/jobs/{job_id}",
            headers=admin_auth_headers,
        )
        assert r.status_code == 200
        job = r.json()
        # Poll the freshly fetched job, not the stale listing entry
        finished = job.get("finished")
        if not finished:
            attempts += 1
            time.sleep(10)
            continue

        assert job["success"]
        break

    # Assert file was updated
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{timeout_crawl}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    files = data.get("resources")
    assert files
    for file_ in files:
        assert file_["numReplicas"] == 1

    # Verify replica is stored
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/jobs/{job_id}", headers=admin_auth_headers
    )
    assert r.status_code == 200
    data = r.json()
    verify_file_replicated(data["file_path"])