Fix POST /orgs/{oid}/crawls/delete (#591)
* Fix POST /orgs/{oid}/crawls/delete:
  - Add permissions check to ensure crawler users can only delete their own crawls
  - Fix broken delete_crawls endpoint
  - Delete files from storage as well as deleting crawl from db
  - Add tests, including nightly test that ensures crawl files are no longer accessible after the crawl is deleted
parent 9532f48515
commit bd4fba7af7
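The commit message above describes the endpoint's new behavior: only a crawl's owner or an org owner may delete it, and deletion also removes the crawl's files from storage. The snippet below is a minimal client-side usage sketch, not part of the commit; API_PREFIX, the auth headers, and the crawl IDs are assumptions modeled on the test fixtures further down.

# Minimal sketch of calling the fixed endpoint (assumed values, modeled on the tests).
import requests

API_PREFIX = "http://localhost/api"  # assumption: wherever the backend API is served

def delete_crawls(org_id, crawl_ids, auth_headers):
    """POST crawl IDs to the delete endpoint and return the number deleted."""
    r = requests.post(
        f"{API_PREFIX}/orgs/{org_id}/crawls/delete",
        headers=auth_headers,
        json={"crawl_ids": crawl_ids},
    )
    # Per this commit: 403 with detail "Not Allowed" if the caller is neither the
    # crawl's owner nor an org owner; on success the body is {"deleted": <count>}.
    r.raise_for_status()
    return r.json()["deleted"]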
@@ -18,7 +18,7 @@ import pymongo
 from .db import BaseMongoModel
 from .users import User
 from .orgs import Organization, MAX_CRAWL_SCALE
-from .storages import get_presigned_url
+from .storages import get_presigned_url, delete_crawl_file_object


 # ============================================================================
@@ -352,13 +352,26 @@ class CrawlOps:
         for update in updates:
             await self.crawls.find_one_and_update(*update)

-    async def delete_crawls(self, oid: uuid.UUID, delete_list: DeleteCrawlList):
+    async def delete_crawls(self, org: Organization, delete_list: DeleteCrawlList):
         """Delete a list of crawls by id for given org"""
+        for crawl_id in delete_list.crawl_ids:
+            await self._delete_crawl_files(org, crawl_id)
+
         res = await self.crawls.delete_many(
-            {"_id": {"$in": delete_list.crawl_ids}, "oid": oid}
+            {"_id": {"$in": delete_list.crawl_ids}, "oid": org.id}
         )

         return res.deleted_count

+    async def _delete_crawl_files(self, org: Organization, crawl_id: str):
+        """Delete files associated with crawl from storage."""
+        crawl_raw = await self.get_crawl_raw(crawl_id, org)
+        crawl = Crawl.from_dict(crawl_raw)
+        for file_ in crawl.files:
+            status_code = await delete_crawl_file_object(org, file_, self.crawl_manager)
+            if status_code != 204:
+                raise HTTPException(status_code=400, detail="file_deletion_error")
+
     async def add_new_crawl(self, crawl_id: str, crawlconfig):
         """initialize new crawl"""
         crawl = Crawl(
@@ -651,17 +664,29 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user

     @app.post("/orgs/{oid}/crawls/delete", tags=["crawls"])
     async def delete_crawls(
-        delete_list: DeleteCrawlList, org: Organization = Depends(org_crawl_dep)
+        delete_list: DeleteCrawlList,
+        user: User = Depends(user_dep),
+        org: Organization = Depends(org_crawl_dep),
     ):
-        try:
-            for crawl_id in delete_list:
-                await crawl_manager.stop_crawl(crawl_id, org.id, graceful=False)
+        # Ensure user has appropriate permissions for all crawls in list:
+        # - Crawler users can delete their own crawls
+        # - Org owners can delete any crawls in org
+        for crawl_id in delete_list.crawl_ids:
+            crawl_raw = await ops.get_crawl_raw(crawl_id, org)
+            crawl = Crawl.from_dict(crawl_raw)
+            if (crawl.userid != user.id) and not org.is_owner(user):
+                raise HTTPException(status_code=403, detail="Not Allowed")

-        except Exception as exc:
-            # pylint: disable=raise-missing-from
-            raise HTTPException(status_code=400, detail=f"Error Stopping Crawl: {exc}")
+            if not crawl.finished:
+                try:
+                    await ops.shutdown_crawl(crawl_id, org, graceful=False)
+                except Exception as exc:
+                    # pylint: disable=raise-missing-from
+                    raise HTTPException(
+                        status_code=400, detail=f"Error Stopping Crawl: {exc}"
+                    )

-        res = await ops.delete_crawls(org.id, delete_list)
+        res = await ops.delete_crawls(org, delete_list)

         return {"deleted": res}
@@ -122,3 +122,29 @@ async def get_presigned_url(org, crawlfile, crawl_manager, duration=3600):
     )

     return presigned_url
+
+
+# ============================================================================
+async def delete_crawl_file_object(org, crawlfile, crawl_manager):
+    """delete crawl file from storage."""
+    status_code = None
+
+    if crawlfile.def_storage_name:
+        s3storage = await crawl_manager.get_default_storage(crawlfile.def_storage_name)
+
+    elif org.storage.type == "s3":
+        s3storage = org.storage
+
+    else:
+        raise TypeError("No Default Storage Found, Invalid Storage Type")
+
+    async with get_s3_client(s3storage, s3storage.use_access_for_presign) as (
+        client,
+        bucket,
+        key,
+    ):
+        key += crawlfile.filename
+        response = await client.delete_object(Bucket=bucket, Key=key)
+        status_code = response["ResponseMetadata"]["HTTPStatusCode"]
+
+    return status_code
@@ -202,6 +202,34 @@ def crawler_crawl_id(crawler_auth_headers, default_org_id):
         time.sleep(5)


+@pytest.fixture(scope="session")
+def wr_specs_crawl_id(crawler_auth_headers, default_org_id):
+    # Start crawl.
+    crawl_data = {
+        "runNow": True,
+        "name": "Webrecorder Specs sample crawl",
+        "config": {"seeds": ["https://specs.webrecorder.net/"], "limit": 1},
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    crawl_id = data["run_now_job"]
+    # Wait for it to complete and then return crawl ID
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=crawler_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "complete":
+            return crawl_id
+        time.sleep(5)
+
+
 @pytest.fixture(scope="session")
 def crawler_config_id(crawler_crawl_id):
     return _crawler_config_id
@@ -150,3 +150,74 @@ def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id):
     data = r.json()
     assert data["tags"] == []
     assert not data["notes"]
+
+
+def test_delete_crawls_crawler(
+    crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
+):
+    # Test that crawler user can't delete another user's crawls
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
+        headers=crawler_auth_headers,
+        json={"crawl_ids": [admin_crawl_id]},
+    )
+    assert r.status_code == 403
+    data = r.json()
+    assert data["detail"] == "Not Allowed"
+
+    # Test that crawler user can delete own crawl
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
+        headers=crawler_auth_headers,
+        json={"crawl_ids": [crawler_crawl_id]},
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["deleted"] == 1
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 404
+
+
+def test_delete_crawls_org_owner(
+    admin_auth_headers,
+    crawler_auth_headers,
+    default_org_id,
+    admin_crawl_id,
+    crawler_crawl_id,
+    wr_specs_crawl_id,
+):
+    # Test that org owner can delete own crawl
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
+        headers=admin_auth_headers,
+        json={"crawl_ids": [admin_crawl_id]},
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["deleted"]
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 404
+
+    # Test that org owner can delete another org user's crawls
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
+        headers=admin_auth_headers,
+        json={"crawl_ids": [wr_specs_crawl_id]},
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["deleted"] == 1
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{wr_specs_crawl_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 404
@@ -41,3 +41,67 @@ def default_org_id(admin_auth_headers):
         except:
             print("Waiting for default org id")
             time.sleep(5)
+
+
+@pytest.fixture(scope="session")
+def crawl_id_wr(admin_auth_headers, default_org_id):
+    # Start crawl.
+    crawl_data = {
+        "runNow": True,
+        "name": "Webrecorder admin test crawl",
+        "tags": ["wr", "nightly testing"],
+        "config": {
+            "seeds": ["https://webrecorder.net/"],
+            "limit": 1,
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    crawl_id = data["run_now_job"]
+    # Wait for it to complete and then return crawl ID
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "complete":
+            return crawl_id
+        time.sleep(5)
+
+
+@pytest.fixture(scope="session")
+def crawl_id_wr_specs(admin_auth_headers, default_org_id):
+    # Start crawl.
+    crawl_data = {
+        "runNow": True,
+        "name": "Webrecorder Specs admin test crawl",
+        "tags": ["wr-specs", "nightly testing"],
+        "config": {
+            "seeds": ["https://specs.webrecorder.net/"],
+            "limit": 1,
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    crawl_id = data["run_now_job"]
+    # Wait for it to complete and then return crawl ID
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "complete":
+            return crawl_id
+        time.sleep(5)
backend/test_nightly/test_delete_crawls.py (new file, 88 additions)
@@ -0,0 +1,88 @@
+import os
+import requests
+import time
+
+from .conftest import API_PREFIX, HOST_PREFIX
+
+
+def test_delete_crawls(
+    tmp_path, admin_auth_headers, default_org_id, crawl_id_wr, crawl_id_wr_specs
+):
+    # Check that crawls have associated files
+    crawl_resource_urls = []
+
+    def _file_is_retrievable(url):
+        """Attempt to retrieve file at url and return True or False."""
+        file_path = str(tmp_path / "test_download")
+        if os.path.exists(file_path):
+            os.remove(file_path)
+
+        r = requests.get(f"{HOST_PREFIX}{url}")
+        if not r.status_code == 200:
+            return False
+
+        with open(file_path, "wb") as fd:
+            fd.write(r.content)
+
+        if not (os.path.isfile(file_path) and os.path.getsize(file_path) > 0):
+            return False
+
+        os.remove(file_path)
+        return True
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    resources = data["resources"]
+    assert resources
+    for resource in resources:
+        crawl_resource_urls.append(resource["path"])
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    resources = data["resources"]
+    assert resources
+    for resource in resources:
+        crawl_resource_urls.append(resource["path"])
+
+    # Test retrieving resources
+    for url in crawl_resource_urls:
+        assert _file_is_retrievable(url)
+
+    # Delete crawls
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
+        headers=admin_auth_headers,
+        json={"crawl_ids": [crawl_id_wr, crawl_id_wr_specs]},
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["deleted"] == 2
+
+    # Verify that crawls don't exist in db
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 404
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 404
+
+    # Give Minio time to delete the files
+    time.sleep(120)
+
+    # Verify that files are no longer retrievable from storage
+    for url in crawl_resource_urls:
+        assert not _file_is_retrievable(url)