browsertrix/backend/test_nightly/test_delete_crawls.py
Tessa Walsh bd4fba7af7
Fix POST /orgs/{oid}/crawls/delete (#591)

- Add a permissions check so that crawler users can only delete their own crawls (see the sketch after this commit message)
- Fix the broken delete_crawls endpoint
- Delete crawl files from storage as well as deleting the crawl record from the db
- Add tests, including a nightly test that ensures crawl files are no longer accessible after the crawl is deleted

2023-02-15 21:06:12 -05:00
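
The handler itself is not shown on this page. Purely as an illustration, a minimal sketch of the kind of ownership check described above, assuming a FastAPI-style async handler (the names delete_list, user.is_crawler, crawl.userid, and crawl_ops are hypothetical, not the project's actual API):

# Illustrative sketch only -- not the actual Browsertrix handler.
from fastapi import HTTPException


async def delete_crawls(oid, delete_list, user, crawl_ops):
    """Bulk-delete crawls; crawler users may only delete crawls they own."""
    for crawl_id in delete_list.crawl_ids:
        crawl = await crawl_ops.get_crawl(crawl_id, oid)
        # Org admins may delete any crawl in the org; crawler-role users
        # are limited to crawls they started themselves.
        if user.is_crawler and crawl.userid != user.id:
            raise HTTPException(status_code=403, detail="not_allowed")

    # Remove files from storage first, then the crawl records from the db,
    # and report how many crawls were deleted.
    deleted = await crawl_ops.delete_crawls(oid, delete_list.crawl_ids)
    return {"deleted": deleted}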


import os
import requests
import time

from .conftest import API_PREFIX, HOST_PREFIX

def test_delete_crawls(
    tmp_path, admin_auth_headers, default_org_id, crawl_id_wr, crawl_id_wr_specs
):
    # Check that crawls have associated files
    crawl_resource_urls = []

    def _file_is_retrievable(url):
        """Attempt to retrieve file at url and return True or False."""
        file_path = str(tmp_path / "test_download")
        if os.path.exists(file_path):
            os.remove(file_path)

        r = requests.get(f"{HOST_PREFIX}{url}")
        if r.status_code != 200:
            return False

        with open(file_path, "wb") as fd:
            fd.write(r.content)

        if not (os.path.isfile(file_path) and os.path.getsize(file_path) > 0):
            return False

        os.remove(file_path)
        return True

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    resources = data["resources"]
    assert resources
    for resource in resources:
        crawl_resource_urls.append(resource["path"])

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    resources = data["resources"]
    assert resources
    for resource in resources:
        crawl_resource_urls.append(resource["path"])

    # Test retrieving resources
    for url in crawl_resource_urls:
        assert _file_is_retrievable(url)
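
    # Verifying retrievability up front guards against a vacuous pass:
    # if the files were never reachable, the "no longer retrievable"
    # checks after deletion would pass for the wrong reason.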

    # Delete crawls
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": [crawl_id_wr, crawl_id_wr_specs]},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"] == 2

    # Verify that crawls don't exist in db
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 404

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 404

    # Give Minio time to delete the files
    time.sleep(120)

    # Verify that files are no longer retrievable from storage
    for url in crawl_resource_urls:
        assert not _file_is_retrievable(url)
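
The fixed two-minute sleep gives MinIO time to remove the objects before the final checks run. A polling loop is a sturdier alternative when deletion latency varies; purely as a sketch (not part of the actual test -- the 120 s budget and 5 s interval are arbitrary, and the retrievability check is passed in so the helper stays self-contained):

import time


def _wait_until_unretrievable(urls, is_retrievable, timeout=120, interval=5):
    """Return True once no url is retrievable, or False if the timeout expires."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if not any(is_retrievable(url) for url in urls):
            return True
        time.sleep(interval)
    return False

The sleep plus the final loop could then collapse into a single assert _wait_until_unretrievable(crawl_resource_urls, _file_is_retrievable).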