browsertrix/backend/test_nightly/test_delete_crawls.py
Ilya Kreymer fb3d88291f
Background Jobs Work (#1321)
Fixes #1252 

Supports a generic background job system, with two background jobs,
CreateReplicaJob and DeleteReplicaJob.
- CreateReplicaJob runs on new crawls, uploads, and profiles, and
updates the `replicas` array with info about the replica after the job
succeeds.
- DeleteReplicaJob deletes the replica.
- Both jobs are created from the new `replica_job.yaml` template. The
CreateReplicaJob sets secrets for primary storage + replica storage,
while DeleteReplicaJob only needs the replica storage.
- The job is processed in the operator when the job is finalized
(deleted), which should happen immediately when the job is done, either
because it succeeds or because the backoffLimit is reached (currently
set to 3).
- The /jobs/ API lists all jobs in a paginated response, with filtering
and sorting (see the sketch after this list).
- /jobs/<job id> returns details for a particular job.
- tests: nightly tests updated to check create + delete replica jobs
for crawls as well as uploads, plus the job API endpoints.
- tests: also fixes timeouts in nightly tests to keep crawls from
finishing too quickly.
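
A minimal sketch of querying the new endpoints (not part of this
commit; the base URL, org ID, and auth token are placeholder
assumptions, while the query parameters match those used in the
nightly tests below):

    import requests

    API_PREFIX = "http://127.0.0.1:30870/api"  # placeholder deployment URL
    org_id = "<org-uuid>"  # placeholder org ID
    headers = {"Authorization": "Bearer <admin-token>"}  # placeholder token

    # List delete-replica jobs, most recently started first
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/jobs"
        "?sortBy=started&sortDirection=-1&jobType=delete-replica",
        headers=headers,
    )
    jobs = r.json()["items"]

    # Fetch details for the most recent job
    if jobs:
        r = requests.get(
            f"{API_PREFIX}/orgs/{org_id}/jobs/{jobs[0]['id']}", headers=headers
        )
        print(r.json())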

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-11-02 13:02:17 -07:00


import os
import time

import requests

from .conftest import API_PREFIX, HOST_PREFIX
from .utils import verify_file_and_replica_deleted

def test_delete_crawls(
    tmp_path, admin_auth_headers, default_org_id, crawl_id_wr, crawl_id_wr_specs
):
    # Check that crawls have associated files
    crawl_resource_urls = []

    def _file_is_retrievable(url):
        """Attempt to retrieve file at url and return True or False."""
        file_path = str(tmp_path / "test_download")
        if os.path.exists(file_path):
            os.remove(file_path)

        r = requests.get(f"{HOST_PREFIX}{url}")
        if r.status_code != 200:
            return False

        with open(file_path, "wb") as fd:
            fd.write(r.content)

        if not (os.path.isfile(file_path) and os.path.getsize(file_path) > 0):
            return False

        os.remove(file_path)
        return True
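
    # Fetch replay.json for each crawl and collect its resource file paths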
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    resources = data["resources"]
    assert resources
    for resource in resources:
        crawl_resource_urls.append(resource["path"])

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    resources = data["resources"]
    assert resources
    for resource in resources:
        crawl_resource_urls.append(resource["path"])

    # Test retrieving resources
    for url in crawl_resource_urls:
        assert _file_is_retrievable(url)
    # Delete crawls
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": [crawl_id_wr, crawl_id_wr_specs]},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"]

    # Verify that crawls don't exist in db
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 404

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 404

    # Give MinIO time to delete the files
    time.sleep(120)

    # Verify that files are no longer retrievable from storage
    for url in crawl_resource_urls:
        assert not _file_is_retrievable(url)


def test_delete_replica_job_run(admin_auth_headers, default_org_id):
    # Give the operator time to kick off the delete replica job
    time.sleep(20)

    # Verify delete replica job was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/jobs?sortBy=started&sortDirection=-1&jobType=delete-replica",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    latest_job = r.json()["items"][0]
    assert latest_job["type"] == "delete-replica"
    job_id = latest_job["id"]

    # Poll the job detail endpoint until the job finishes, up to 5 attempts
    attempts = 0
    while attempts < 5:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/jobs/{job_id}",
            headers=admin_auth_headers,
        )
        assert r.status_code == 200
        job = r.json()

        # Check the freshly fetched job, not the stale listing from above
        finished = job.get("finished")
        if not finished:
            attempts += 1
            time.sleep(10)
            continue

        assert job["success"]
        break

    time.sleep(10)

    # Verify replica is no longer stored
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/jobs/{job_id}", headers=admin_auth_headers
    )
    assert r.status_code == 200
    job = r.json()
    verify_file_and_replica_deleted(job["file_path"])