Fix POST /orgs/{oid}/crawls/delete (#591)

* Fix POST /orgs/{oid}/crawls/delete

- Add permissions check to ensure crawler users can only delete
their own crawls
- Fix broken delete_crawls endpoint
- Delete crawl files from storage as well as deleting the crawl from the db
- Add tests, including a nightly test that ensures crawl files are
no longer accessible after the crawl is deleted
Tessa Walsh 2023-02-15 21:06:12 -05:00 committed by GitHub
parent 9532f48515
commit bd4fba7af7
6 changed files with 313 additions and 11 deletions
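
For reference, the fixed endpoint takes a JSON body listing crawl ids and returns the number of deleted crawls; crawler users receive a 403 if any listed crawl belongs to another user, while org owners may delete any crawl in the org. A minimal sketch of calling it with the requests library — the host prefix, org id, crawl id, and token below are hypothetical placeholders:

import requests

API_PREFIX = "http://localhost:9871/api"  # hypothetical host/prefix
ORG_ID = "<org-uuid>"                     # hypothetical org id
AUTH_HEADERS = {"Authorization": "Bearer <access-token>"}  # hypothetical token

r = requests.post(
    f"{API_PREFIX}/orgs/{ORG_ID}/crawls/delete",
    headers=AUTH_HEADERS,
    json={"crawl_ids": ["<crawl-id>"]},   # hypothetical crawl id
)
# Expect 200 with {"deleted": <count>}, or 403 if the user may not delete a listed crawl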

View File

@@ -18,7 +18,7 @@ import pymongo
 from .db import BaseMongoModel
 from .users import User
 from .orgs import Organization, MAX_CRAWL_SCALE
-from .storages import get_presigned_url
+from .storages import get_presigned_url, delete_crawl_file_object

 # ============================================================================
@@ -352,13 +352,26 @@ class CrawlOps:
         for update in updates:
             await self.crawls.find_one_and_update(*update)

-    async def delete_crawls(self, oid: uuid.UUID, delete_list: DeleteCrawlList):
+    async def delete_crawls(self, org: Organization, delete_list: DeleteCrawlList):
         """Delete a list of crawls by id for given org"""
+        for crawl_id in delete_list.crawl_ids:
+            await self._delete_crawl_files(org, crawl_id)
+
         res = await self.crawls.delete_many(
-            {"_id": {"$in": delete_list.crawl_ids}, "oid": oid}
+            {"_id": {"$in": delete_list.crawl_ids}, "oid": org.id}
         )
         return res.deleted_count

+    async def _delete_crawl_files(self, org: Organization, crawl_id: str):
+        """Delete files associated with crawl from storage."""
+        crawl_raw = await self.get_crawl_raw(crawl_id, org)
+        crawl = Crawl.from_dict(crawl_raw)
+        for file_ in crawl.files:
+            status_code = await delete_crawl_file_object(org, file_, self.crawl_manager)
+            if status_code != 204:
+                raise HTTPException(status_code=400, detail="file_deletion_error")
+
     async def add_new_crawl(self, crawl_id: str, crawlconfig):
         """initialize new crawl"""
         crawl = Crawl(
@@ -651,17 +664,29 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
     @app.post("/orgs/{oid}/crawls/delete", tags=["crawls"])
     async def delete_crawls(
-        delete_list: DeleteCrawlList, org: Organization = Depends(org_crawl_dep)
+        delete_list: DeleteCrawlList,
+        user: User = Depends(user_dep),
+        org: Organization = Depends(org_crawl_dep),
     ):
-        try:
-            for crawl_id in delete_list:
-                await crawl_manager.stop_crawl(crawl_id, org.id, graceful=False)
+        # Ensure user has appropriate permissions for all crawls in list:
+        # - Crawler users can delete their own crawls
+        # - Org owners can delete any crawls in org
+        for crawl_id in delete_list.crawl_ids:
+            crawl_raw = await ops.get_crawl_raw(crawl_id, org)
+            crawl = Crawl.from_dict(crawl_raw)
+
+            if (crawl.userid != user.id) and not org.is_owner(user):
+                raise HTTPException(status_code=403, detail="Not Allowed")
-        except Exception as exc:
-            # pylint: disable=raise-missing-from
-            raise HTTPException(status_code=400, detail=f"Error Stopping Crawl: {exc}")
+            if not crawl.finished:
+                try:
+                    await ops.shutdown_crawl(crawl_id, org, graceful=False)
+                except Exception as exc:
+                    # pylint: disable=raise-missing-from
+                    raise HTTPException(
+                        status_code=400, detail=f"Error Stopping Crawl: {exc}"
+                    )
-        res = await ops.delete_crawls(org.id, delete_list)
+        res = await ops.delete_crawls(org, delete_list)
         return {"deleted": res}

View File

@@ -122,3 +122,29 @@ async def get_presigned_url(org, crawlfile, crawl_manager, duration=3600):
)
return presigned_url
# ============================================================================
async def delete_crawl_file_object(org, crawlfile, crawl_manager):
"""delete crawl file from storage."""
status_code = None
if crawlfile.def_storage_name:
s3storage = await crawl_manager.get_default_storage(crawlfile.def_storage_name)
elif org.storage.type == "s3":
s3storage = org.storage
else:
raise TypeError("No Default Storage Found, Invalid Storage Type")
async with get_s3_client(s3storage, s3storage.use_access_for_presign) as (
client,
bucket,
key,
):
key += crawlfile.filename
response = await client.delete_object(Bucket=bucket, Key=key)
status_code = response["ResponseMetadata"]["HTTPStatusCode"]
return status_code
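
delete_crawl_file_object relies on the existing get_s3_client helper (not part of this diff) to yield a (client, bucket, key) tuple for the resolved storage. For orientation, a minimal sketch of such a helper built on aiobotocore — the storage field names and exact signature here are assumptions inferred from the call site, not the project's actual implementation:

from contextlib import asynccontextmanager
from urllib.parse import urlsplit

from aiobotocore.session import get_session


@asynccontextmanager
async def get_s3_client(storage, use_access_for_presign=False):
    """Yield (client, bucket, key prefix) for an S3 storage config (sketch, assumed fields)."""
    endpoint = storage.access_endpoint_url if use_access_for_presign else storage.endpoint_url
    parts = urlsplit(endpoint)
    bucket, _, key_prefix = parts.path[1:].partition("/")

    session = get_session()
    async with session.create_client(
        "s3",
        endpoint_url=f"{parts.scheme}://{parts.netloc}",
        aws_access_key_id=storage.access_key,
        aws_secret_access_key=storage.secret_key,
    ) as client:
        yield client, bucket, key_prefix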

View File

@@ -202,6 +202,34 @@ def crawler_crawl_id(crawler_auth_headers, default_org_id):
time.sleep(5)
@pytest.fixture(scope="session")
def wr_specs_crawl_id(crawler_auth_headers, default_org_id):
# Start crawl.
crawl_data = {
"runNow": True,
"name": "Webrecorder Specs sample crawl",
"config": {"seeds": ["https://specs.webrecorder.net/"], "limit": 1},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=crawl_data,
)
data = r.json()
crawl_id = data["run_now_job"]
# Wait for it to complete and then return crawl ID
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=crawler_auth_headers,
)
data = r.json()
if data["state"] == "complete":
return crawl_id
time.sleep(5)
@pytest.fixture(scope="session")
def crawler_config_id(crawler_crawl_id):
return _crawler_config_id

View File

@@ -150,3 +150,74 @@ def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id):
data = r.json()
assert data["tags"] == []
assert not data["notes"]
def test_delete_crawls_crawler(
crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
# Test that crawler user can't delete another user's crawls
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=crawler_auth_headers,
json={"crawl_ids": [admin_crawl_id]},
)
assert r.status_code == 403
data = r.json()
assert data["detail"] == "Not Allowed"
# Test that crawler user can delete own crawl
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=crawler_auth_headers,
json={"crawl_ids": [crawler_crawl_id]},
)
assert r.status_code == 200
data = r.json()
assert data["deleted"] == 1
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 404
def test_delete_crawls_org_owner(
admin_auth_headers,
crawler_auth_headers,
default_org_id,
admin_crawl_id,
crawler_crawl_id,
wr_specs_crawl_id,
):
# Test that org owner can delete own crawl
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=admin_auth_headers,
json={"crawl_ids": [admin_crawl_id]},
)
assert r.status_code == 200
data = r.json()
assert data["deleted"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
headers=admin_auth_headers,
)
assert r.status_code == 404
# Test that org owner can delete another org user's crawls
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=admin_auth_headers,
json={"crawl_ids": [wr_specs_crawl_id]},
)
assert r.status_code == 200
data = r.json()
assert data["deleted"] == 1
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{wr_specs_crawl_id}",
headers=admin_auth_headers,
)
assert r.status_code == 404

View File

@@ -41,3 +41,67 @@ def default_org_id(admin_auth_headers):
except:
print("Waiting for default org id")
time.sleep(5)
@pytest.fixture(scope="session")
def crawl_id_wr(admin_auth_headers, default_org_id):
# Start crawl.
crawl_data = {
"runNow": True,
"name": "Webrecorder admin test crawl",
"tags": ["wr", "nightly testing"],
"config": {
"seeds": ["https://webrecorder.net/"],
"limit": 1,
},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
data = r.json()
crawl_id = data["run_now_job"]
# Wait for it to complete and then return crawl ID
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
if data["state"] == "complete":
return crawl_id
time.sleep(5)
@pytest.fixture(scope="session")
def crawl_id_wr_specs(admin_auth_headers, default_org_id):
# Start crawl.
crawl_data = {
"runNow": True,
"name": "Webrecorder Specs admin test crawl",
"tags": ["wr-specs", "nightly testing"],
"config": {
"seeds": ["https://specs.webrecorder.net/"],
"limit": 1,
},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
data = r.json()
crawl_id = data["run_now_job"]
# Wait for it to complete and then return crawl ID
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
if data["state"] == "complete":
return crawl_id
time.sleep(5)
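
The two nightly fixtures above (and the wr_specs_crawl_id fixture earlier) repeat the same start-crawl-and-poll pattern. A small helper along these lines could factor it out — a sketch only, not part of the commit, assuming the same module-level requests, time, and API_PREFIX imports used by the fixtures:

def run_crawl_and_wait(org_id, headers, crawl_data, poll_interval=5):
    """Create a crawl config with runNow=True, then poll replay.json until the crawl completes."""
    r = requests.post(
        f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/",
        headers=headers,
        json=crawl_data,
    )
    crawl_id = r.json()["run_now_job"]

    # Poll until the crawl reaches the "complete" state, then return its id
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
            headers=headers,
        )
        if r.json()["state"] == "complete":
            return crawl_id
        time.sleep(poll_interval)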

View File

@@ -0,0 +1,88 @@
import os
import requests
import time
from .conftest import API_PREFIX, HOST_PREFIX
def test_delete_crawls(
tmp_path, admin_auth_headers, default_org_id, crawl_id_wr, crawl_id_wr_specs
):
# Check that crawls have associated files
crawl_resource_urls = []
def _file_is_retrievable(url):
"""Attempt to retrieve file at url and return True or False."""
file_path = str(tmp_path / "test_download")
if os.path.exists(file_path):
os.remove(file_path)
r = requests.get(f"{HOST_PREFIX}{url}")
if not r.status_code == 200:
return False
with open(file_path, "wb") as fd:
fd.write(r.content)
if not (os.path.isfile(file_path) and os.path.getsize(file_path) > 0):
return False
os.remove(file_path)
return True
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
resources = data["resources"]
assert resources
for resource in resources:
crawl_resource_urls.append(resource["path"])
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
resources = data["resources"]
assert resources
for resource in resources:
crawl_resource_urls.append(resource["path"])
# Test retrieving resources
for url in crawl_resource_urls:
assert _file_is_retrievable(url)
# Delete crawls
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=admin_auth_headers,
json={"crawl_ids": [crawl_id_wr, crawl_id_wr_specs]},
)
assert r.status_code == 200
data = r.json()
assert data["deleted"] == 2
# Verify that crawls don't exist in db
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 404
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 404
# Give Minio time to delete the files
time.sleep(120)
# Verify that files are no longer retrievable from storage
for url in crawl_resource_urls:
assert not _file_is_retrievable(url)
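
The fixed two-minute sleep gives MinIO time to process the deletions before the final check. A more robust alternative (a sketch, not part of this commit) would poll the same resource URLs until they stop being retrievable or a deadline passes; it reuses the requests, time, and HOST_PREFIX imports already present in the test module:

def _wait_until_not_retrievable(urls, timeout=300, interval=10):
    """Return True once every url stops returning 200, or False after timeout seconds."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if all(requests.get(f"{HOST_PREFIX}{url}").status_code != 200 for url in urls):
            return True
        time.sleep(interval)
    return False

# usage in the test: assert _wait_until_not_retrievable(crawl_resource_urls)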