Fixes #1252

Supports a generic background job system, with two background jobs, CreateReplicaJob and DeleteReplicaJob:

- CreateReplicaJob runs on new crawls, uploads, and profiles, and updates the `replicas` array with info about the replica after the job succeeds.
- DeleteReplicaJob deletes the replica.
- Both jobs are created from the new `replica_job.yaml` template. CreateReplicaJob sets secrets for primary storage + replica storage, while DeleteReplicaJob only needs the replica storage.
- The job is processed in the operator when the job is finalized (deleted), which should happen immediately when the job is done, either because it succeeds or because the backoffLimit is reached (currently set to 3).
- The `/jobs/` API lists all jobs using a paginated response, including filtering and sorting.
- `/jobs/<job id>` returns details for a particular job (see the sketch below this description).
- Tests: nightly tests updated to check create + delete replica jobs for crawls as well as uploads, and the job API endpoints.
- Tests: also fixes timeouts in nightly tests to avoid crawls finishing too quickly.

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
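As an illustration of the new endpoints, a nightly test could list background jobs and then fetch one job's details. This is only a sketch: the `check_replica_jobs` helper name, the org-scoped path, the query parameters, and the paginated response fields (`items`, `total`, `id`) are assumptions based on typical Browsertrix API patterns, not confirmed by this diff.

```python
import requests

from .conftest import API_PREFIX


def check_replica_jobs(org_id, headers):
    """Sketch: list background jobs for an org, then fetch details of the newest one."""
    # Assumed org-scoped list endpoint with sorting/filtering query params
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/jobs/?sortBy=started&sortDirection=-1",
        headers=headers,
    )
    data = r.json()
    assert data["total"] >= 1  # assumed paginated response shape

    job_id = data["items"][0]["id"]

    # Assumed detail endpoint: /jobs/<job id>
    r = requests.get(f"{API_PREFIX}/orgs/{org_id}/jobs/{job_id}", headers=headers)
    assert r.status_code == 200
```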
"""nightly test utils"""

import requests
import hashlib
import os
import tempfile

import boto3
import pytest

from .conftest import API_PREFIX

def get_crawl_status(org_id, crawl_id, headers):
    """Return the current state of a crawl via its replay.json endpoint."""
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
        headers=headers,
    )
    data = r.json()
    return data["state"]

def read_in_chunks(fh, blocksize=1024):
    """Lazy function (generator) to read a file piece by piece.
    Default chunk size: 1k."""
    while True:
        data = fh.read(blocksize)
        if not data:
            break
        yield data
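# Illustrative usage (not part of the original file): read_in_chunks() can hash a
# local file the same way download_file_and_return_hash() hashes a remote object,
# e.g. to confirm an uploaded file survived replication intact. The path below is
# hypothetical.
#
#     hasher = hashlib.sha256()
#     with open("example.wacz", "rb") as fh:
#         for chunk in read_in_chunks(fh):
#             hasher.update(chunk)
#     local_hash = hasher.hexdigest()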

def download_file_and_return_hash(bucket_name: str, file_path: str) -> str:
    """Download an object from local S3 storage and return its sha256 hex digest."""
    # Local S3 endpoint and static credentials used by the nightly test deployment
    endpoint_url = "http://127.0.0.1:30090/"
    client = boto3.client(
        "s3",
        region_name="",
        endpoint_url=endpoint_url,
        aws_access_key_id="ADMIN",
        aws_secret_access_key="PASSW0RD",
    )
    try:
        response = client.get_object(Bucket=bucket_name, Key=file_path)
        h = hashlib.sha256()
        for chunk in response["Body"].iter_chunks():
            h.update(chunk)
        return h.hexdigest()
    except client.exceptions.NoSuchKey:
        # Re-raise so callers can assert that a missing object raises
        raise

def verify_file_replicated(file_path: str):
    """Assert the file exists in primary and replica storage with identical contents."""
    # file_path must be a relative key, not prefixed with a bucket name
    assert "btrix-test-data/" not in file_path
    assert "replica-0/" not in file_path
    primary_file_hash = download_file_and_return_hash("btrix-test-data", file_path)
    replica_file_hash = download_file_and_return_hash("replica-0", file_path)
    assert primary_file_hash
    assert replica_file_hash
    assert primary_file_hash == replica_file_hash

def verify_file_and_replica_deleted(file_path: str):
    """Assert the file has been removed from both primary and replica storage."""
    # file_path must be a relative key, not prefixed with a bucket name
    assert "btrix-test-data/" not in file_path
    assert "replica-0/" not in file_path
    with pytest.raises(Exception):
        download_file_and_return_hash("btrix-test-data", file_path)
    with pytest.raises(Exception):
        download_file_and_return_hash("replica-0", file_path)
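These helpers come together in the nightly replica tests roughly as follows. This is a sketch only: the module path `.utils`, the polling bounds, the use of the `resources` field in `replay.json` to get the stored file name, the crawl delete endpoint, and the wait for DeleteReplicaJob are assumptions about how the tests are wired up, not taken from this file.

```python
import time
import requests

from .conftest import API_PREFIX
from .utils import (
    get_crawl_status,
    verify_file_replicated,
    verify_file_and_replica_deleted,
)


def run_replica_checks(org_id, crawl_id, headers):
    """Sketch: wait for a crawl, check replication, then delete and check cleanup."""
    # Wait for the crawl to finish (assumed terminal state "complete", bounded loop)
    for _ in range(180):
        if get_crawl_status(org_id, crawl_id, headers) == "complete":
            break
        time.sleep(5)

    # Assumed: replay.json lists the stored WACZ files under "resources"
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
        headers=headers,
    )
    file_path = r.json()["resources"][0]["name"]

    # CreateReplicaJob should have copied the file to the replica bucket
    verify_file_replicated(file_path)

    # Assumed delete endpoint; deleting the crawl should kick off DeleteReplicaJob
    requests.post(
        f"{API_PREFIX}/orgs/{org_id}/crawls/delete",
        headers=headers,
        json={"crawl_ids": [crawl_id]},
    )

    # Give the background DeleteReplicaJob time to run (assumed delay)
    time.sleep(30)
    verify_file_and_replica_deleted(file_path)
```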