browsertrix/backend/test_nightly/test_delete_crawls.py
Ilya Kreymer fb3d88291f
Background Jobs Work (#1321)
Fixes #1252 

Supports a generic background job system, with two background jobs,
CreateReplicaJob and DeleteReplicaJob.
- CreateReplicaJob runs on new crawls, uploads, and profiles, and
updates the `replicas` array with info about the replica after the job
succeeds.
- DeleteReplicaJob deletes the replica.
- Both jobs are created from the new `replica_job.yaml` template. The
CreateReplicaJob sets secrets for primary storage + replica storage,
while DeleteReplicaJob only needs the replica storage.
- The job is processed in the operator when the job is finalized
(deleted), which should happen immediately when the job is done, either
because it succeeds or because the backoffLimit is reached (currently
set to 3).
- The /jobs/ API lists all jobs in a paginated response, with filtering
and sorting (see the sketch after this list).
- /jobs/<job id> returns details for a particular job.
- tests: nightly tests updated to check create + delete replica jobs
for crawls as well as uploads, plus the job API endpoints.
- tests: also fixes timeouts in nightly tests to keep crawls from
finishing too quickly.
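
A minimal sketch of querying the new endpoints (not part of this
commit; the base URL, org ID, and auth token are placeholder
assumptions, while the query parameters match those used in the
nightly tests below):

    import requests

    API_PREFIX = "http://127.0.0.1:30870/api"  # placeholder deployment URL
    org_id = "<org-uuid>"  # placeholder org ID
    headers = {"Authorization": "Bearer <admin-token>"}  # placeholder token

    # List delete-replica jobs, most recently started first
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/jobs"
        "?sortBy=started&sortDirection=-1&jobType=delete-replica",
        headers=headers,
    )
    jobs = r.json()["items"]

    # Fetch details for the most recent job
    if jobs:
        r = requests.get(
            f"{API_PREFIX}/orgs/{org_id}/jobs/{jobs[0]['id']}", headers=headers
        )
        print(r.json())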

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-11-02 13:02:17 -07:00


import os
import time

import requests

from .conftest import API_PREFIX, HOST_PREFIX
from .utils import verify_file_and_replica_deleted

def test_delete_crawls(
    tmp_path, admin_auth_headers, default_org_id, crawl_id_wr, crawl_id_wr_specs
):
    # Check that crawls have associated files
    crawl_resource_urls = []

    def _file_is_retrievable(url):
        """Attempt to retrieve file at url and return True or False."""
        file_path = str(tmp_path / "test_download")
        if os.path.exists(file_path):
            os.remove(file_path)

        r = requests.get(f"{HOST_PREFIX}{url}")
        if r.status_code != 200:
            return False

        with open(file_path, "wb") as fd:
            fd.write(r.content)

        if not (os.path.isfile(file_path) and os.path.getsize(file_path) > 0):
            return False

        os.remove(file_path)
        return True
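
    # Fetch replay.json for each crawl and collect its resource file paths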
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    resources = data["resources"]
    assert resources
    for resource in resources:
        crawl_resource_urls.append(resource["path"])

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    resources = data["resources"]
    assert resources
    for resource in resources:
        crawl_resource_urls.append(resource["path"])

    # Test retrieving resources
    for url in crawl_resource_urls:
        assert _file_is_retrievable(url)
    # Delete crawls
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": [crawl_id_wr, crawl_id_wr_specs]},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"]

    # Verify that crawls don't exist in db
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 404

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 404

    # Give MinIO time to delete the files
    time.sleep(120)

    # Verify that files are no longer retrievable from storage
    for url in crawl_resource_urls:
        assert not _file_is_retrievable(url)


def test_delete_replica_job_run(admin_auth_headers, default_org_id):
    # Give the operator time to kick off the delete replica job
    time.sleep(20)

    # Verify delete replica job was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/jobs?sortBy=started&sortDirection=-1&jobType=delete-replica",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    latest_job = r.json()["items"][0]
    assert latest_job["type"] == "delete-replica"
    job_id = latest_job["id"]

    # Poll the job detail endpoint until the job finishes, up to 5 attempts
    attempts = 0
    while attempts < 5:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/jobs/{job_id}",
            headers=admin_auth_headers,
        )
        assert r.status_code == 200
        job = r.json()

        # Check the freshly fetched job, not the stale listing from above
        finished = job.get("finished")
        if not finished:
            attempts += 1
            time.sleep(10)
            continue

        assert job["success"]
        break

    time.sleep(10)

    # Verify replica is no longer stored
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/jobs/{job_id}", headers=admin_auth_headers
    )
    assert r.status_code == 200
    job = r.json()
    verify_file_and_replica_deleted(job["file_path"])