browsertrix/backend/test_nightly/test_upload_replicas.py
Ilya Kreymer b4fd5e6e94
Crawl Timeout via elapsed time (#1338)
Fixes #1337 

Crawl timeout is tracked via the `elapsedCrawlTime` field on the crawl
status, which is similar to regular crawl execution time but counts only a
single pod when scale > 1. When scale == 1, the two values are equivalent.

The crawl is gracefully stopped when the elapsed execution time exceeds the
timeout. For better responsiveness, the current crawl time since the last
update interval is also included (see the sketch after the details below).

Details:
- handle crawl timeout via elapsed crawl time (the longest running time of a
single pod) instead of an expire time
- include the current running time since the last update for better precision
- more accurately count the elapsed time the crawl is actually running
- store `elapsedCrawlTime` in addition to `crawlExecTime`, keeping the
longest duration of each pod since the last update interval

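As a rough sketch of the check described above, under stated assumptions: the names below (`CrawlStatus`, `PodState`, `crawl_timed_out`, and their fields) are illustrative placeholders, not the actual Browsertrix operator code.

```python
# Illustrative sketch only -- types and names are assumed, not the real operator code
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import List, Optional


@dataclass
class PodState:
    # when this crawler pod was last observed running (None if not started)
    started: Optional[datetime] = None


@dataclass
class CrawlStatus:
    # longest running time of a single pod (seconds), accumulated each update interval
    elapsedCrawlTime: float = 0.0


def crawl_timed_out(status: CrawlStatus, pods: List[PodState], timeout: float) -> bool:
    """Return True if elapsed crawl time, plus time since the last update, exceeds the timeout."""
    if not timeout:
        return False

    now = datetime.now(timezone.utc)

    # for responsiveness, also count how long the longest-running pod has been
    # up since the last update interval (max across pods, not the sum)
    since_last_update = max(
        ((now - pod.started).total_seconds() for pod in pods if pod.started),
        default=0.0,
    )

    return status.elapsedCrawlTime + since_last_update >= timeout
```

When such a check returns True, the crawl is gracefully stopped; taking the max across pods rather than the sum is what distinguishes `elapsedCrawlTime` from the total `crawlExecTime`.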
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-11-06 16:32:58 -08:00


import time
import os

import requests

from .conftest import API_PREFIX
from .utils import (
    read_in_chunks,
    verify_file_replicated,
    verify_file_and_replica_deleted,
)

curr_dir = os.path.dirname(os.path.realpath(__file__))

def test_upload_stream(admin_auth_headers, default_org_id):
    with open(
        os.path.join(curr_dir, "..", "test", "data", "example.wacz"), "rb"
    ) as fh:
        r = requests.put(
            f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test.wacz",
            headers=admin_auth_headers,
            data=read_in_chunks(fh),
        )

    assert r.status_code == 200
    assert r.json()["added"]

    global upload_id
    upload_id = r.json()["id"]

def test_upload_file_replicated(admin_auth_headers, default_org_id):
    time.sleep(20)

    # Verify replication job was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/jobs?sortBy=started&sortDirection=-1&jobType=create-replica",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    latest_job = r.json()["items"][0]
    assert latest_job["type"] == "create-replica"
    job_id = latest_job["id"]

    attempts = 0
    while attempts < 5:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/jobs/{job_id}",
            headers=admin_auth_headers,
        )
        assert r.status_code == 200
        job = r.json()

        # check the freshly fetched job, not the stale listing entry
        finished = job.get("finished")
        if not finished:
            attempts += 1
            time.sleep(10)
            continue

        assert job["success"]
        break

    # Verify file updated
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    files = data.get("resources")
    assert files
    for file_ in files:
        assert file_["numReplicas"] == 1

    # Verify replica is stored
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/jobs/{job_id}", headers=admin_auth_headers
    )
    assert r.status_code == 200
    job = r.json()
    print(job["file_path"])
    verify_file_replicated(job["file_path"])

def test_delete_upload_and_replicas(admin_auth_headers, default_org_id):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": [upload_id]},
    )
    data = r.json()
    assert data["deleted"]
    assert data["storageQuotaReached"] is False

    # Verify upload is no longer listed
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads",
        headers=admin_auth_headers,
    )
    results = r.json()
    for res in results["items"]:
        if res["id"] == upload_id:
            assert False

    time.sleep(20)

    # Verify delete replica job was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/jobs?sortBy=started&sortDirection=-1&jobType=delete-replica",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    latest_job = r.json()["items"][0]
    assert latest_job["type"] == "delete-replica"
    job_id = latest_job["id"]

    attempts = 0
    while attempts < 5:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/jobs/{job_id}",
            headers=admin_auth_headers,
        )
        assert r.status_code == 200
        job = r.json()

        # check the freshly fetched job, not the stale listing entry
        finished = job.get("finished")
        if not finished:
            attempts += 1
            time.sleep(10)
            continue

        assert job["success"]
        break

    time.sleep(10)

    # Verify replica is no longer stored
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/jobs/{job_id}", headers=admin_auth_headers
    )
    assert r.status_code == 200
    job = r.json()
    verify_file_and_replica_deleted(job["file_path"])