API delete endpoint improvements (#1232)

- Applies a user permissions check before deleting anything in all /delete endpoints (see the sketch below)
- Shuts down running crawls before deleting anything in /all-crawls/delete as well as /crawls/delete
- Splits delete_list.crawl_ids into crawl and upload lists at the same time as the checks in /all-crawls/delete
- Updates the frontend notification message to "Only org owners can delete other users' archived items." when a crawler user attempts to delete another user's archived items
Tessa Walsh 2023-10-03 16:05:00 -04:00 committed by GitHub
parent df190e12b9
commit e9bac4c088
8 changed files with 187 additions and 128 deletions
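For context, here is a minimal sketch of the permission rule these changes apply before any deletion. The models are simplified stand-ins, not the real User/Organization classes; the actual check lives in BaseCrawlOps.delete_crawls in the diff below.

from dataclasses import dataclass, field
from uuid import UUID, uuid4

from fastapi import HTTPException


@dataclass
class User:
    id: UUID = field(default_factory=uuid4)


@dataclass
class Organization:
    owner_ids: set[UUID] = field(default_factory=set)

    def is_owner(self, user: User) -> bool:
        return user.id in self.owner_ids


def assert_can_delete(crawl_userid: UUID, user: User, org: Organization) -> None:
    """Crawler users may delete only their own items; org owners may delete any."""
    if crawl_userid != user.id and not org.is_owner(user):
        raise HTTPException(status_code=403, detail="not_allowed")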


@@ -186,8 +186,75 @@ class BaseCrawlOps:
return {"updated": True}
async def update_crawl_state(self, crawl_id: str, state: str):
"""called only when job container is being stopped/canceled"""
data = {"state": state}
# if cancelation, set the finish time here
if state == "canceled":
data["finished"] = dt_now()
await self.crawls.find_one_and_update(
{
"_id": crawl_id,
"type": "crawl",
"state": {"$in": RUNNING_AND_STARTING_STATES},
},
{"$set": data},
)
async def shutdown_crawl(self, crawl_id: str, org: Organization, graceful: bool):
"""stop or cancel specified crawl"""
crawl = await self.get_crawl_raw(crawl_id, org)
if crawl.get("type") != "crawl":
return
result = None
try:
result = await self.crawl_manager.shutdown_crawl(
crawl_id, org.id_str, graceful=graceful
)
if result.get("success"):
if graceful:
await self.crawls.find_one_and_update(
{"_id": crawl_id, "type": "crawl", "oid": org.id},
{"$set": {"stopping": True}},
)
return result
except Exception as exc:
# pylint: disable=raise-missing-from
# if reached here, probably crawl doesn't exist anymore
raise HTTPException(
status_code=404, detail=f"crawl_not_found, (details: {exc})"
)
# if job no longer running, canceling is considered success,
# but graceful stoppage is not possible, so would be a failure
if result.get("error") == "Not Found":
if not graceful:
await self.update_crawl_state(crawl_id, "canceled")
crawl = await self.get_crawl_raw(crawl_id, org)
if not await self.crawl_configs.stats_recompute_last(
crawl["cid"], 0, -1
):
raise HTTPException(
status_code=404,
detail=f"crawl_config_not_found: {crawl['cid']}",
)
return {"success": True}
# return whatever detail may be included in the response
raise HTTPException(status_code=400, detail=result)
async def delete_crawls(
self, org: Organization, delete_list: DeleteCrawlList, type_: str
self,
org: Organization,
delete_list: DeleteCrawlList,
type_: str,
user: Optional[User] = None,
):
"""Delete a list of crawls by id for given org"""
cids_to_update: dict[str, dict[str, int]] = {}
@@ -199,6 +266,21 @@ class BaseCrawlOps:
if crawl.get("type") != type_:
continue
# Ensure user has appropriate permissions for all crawls in list:
# - Crawler users can delete their own crawls
# - Org owners can delete any crawls in org
if user and (crawl.get("userid") != user.id) and not org.is_owner(user):
raise HTTPException(status_code=403, detail="not_allowed")
if type_ == "crawl" and not crawl.get("finished"):
try:
await self.shutdown_crawl(crawl_id, org, graceful=False)
except Exception as exc:
# pylint: disable=raise-missing-from
raise HTTPException(
status_code=400, detail=f"Error Stopping Crawl: {exc}"
)
crawl_size = await self._delete_crawl_files(crawl, org)
size += crawl_size
@@ -486,24 +568,37 @@ class BaseCrawlOps:
return crawls, total
async def delete_crawls_all_types(
self, delete_list: DeleteCrawlList, org: Organization
self,
delete_list: DeleteCrawlList,
org: Organization,
user: Optional[User] = None,
):
"""Delete uploaded crawls"""
if len(delete_list.crawl_ids) == 0:
crawls: list[str] = []
uploads: list[str] = []
for crawl_id in delete_list.crawl_ids:
crawl = await self.get_crawl_raw(crawl_id, org)
type_ = crawl.get("type")
if type_ == "crawl":
crawls.append(crawl_id)
if type_ == "upload":
uploads.append(crawl_id)
crawls_length = len(crawls)
uploads_length = len(uploads)
if crawls_length + uploads_length == 0:
raise HTTPException(status_code=400, detail="nothing_to_delete")
deleted_count = 0
# Value is set in delete calls, but initialize to keep linter happy.
quota_reached = False
crawls_to_delete, uploads_to_delete = await self._split_delete_list_by_type(
delete_list, org
)
if len(crawls_to_delete) > 0:
crawl_delete_list = DeleteCrawlList(crawl_ids=crawls_to_delete)
if crawls_length:
crawl_delete_list = DeleteCrawlList(crawl_ids=crawls)
deleted, cids_to_update, quota_reached = await self.delete_crawls(
org, crawl_delete_list, "crawl"
org, crawl_delete_list, "crawl", user
)
deleted_count += deleted
@@ -512,10 +607,10 @@ class BaseCrawlOps:
cid_inc = cid_dict["inc"]
await self.crawl_configs.stats_recompute_last(cid, -cid_size, -cid_inc)
if len(uploads_to_delete) > 0:
upload_delete_list = DeleteCrawlList(crawl_ids=uploads_to_delete)
if uploads_length:
upload_delete_list = DeleteCrawlList(crawl_ids=uploads)
deleted, _, quota_reached = await self.delete_crawls(
org, upload_delete_list, "upload"
org, upload_delete_list, "upload", user
)
deleted_count += deleted
@@ -524,26 +619,6 @@ class BaseCrawlOps:
return {"deleted": True, "storageQuotaReached": quota_reached}
async def _split_delete_list_by_type(
self, delete_list: DeleteCrawlList, org: Organization
):
"""Return separate crawl and upload arrays from mixed input"""
crawls: list[str] = []
uploads: list[str] = []
for crawl_id in delete_list.crawl_ids:
try:
crawl_raw = await self.get_crawl_raw(crawl_id, org)
crawl_type = crawl_raw.get("type")
if crawl_type == "crawl":
crawls.append(crawl_id)
elif crawl_type == "upload":
uploads.append(crawl_id)
# pylint: disable=broad-exception-caught
except Exception as err:
print(err, flush=True)
return crawls, uploads
async def get_all_crawl_search_values(
self, org: Organization, type_: Optional[str] = None
):
@@ -690,6 +765,7 @@ def init_base_crawls_api(
@app.post("/orgs/{oid}/all-crawls/delete", tags=["all-crawls"])
async def delete_crawls_all_types(
delete_list: DeleteCrawlList,
user: User = Depends(user_dep),
org: Organization = Depends(org_crawl_dep),
):
return await ops.delete_crawls_all_types(delete_list, org)
return await ops.delete_crawls_all_types(delete_list, org, user)
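As a usage illustration only, mirroring the tests added further down: a crawler-role user deleting another user's archived item through the updated /all-crawls/delete endpoint should now get a 403. The base URL, org id, token, and crawl id below are placeholders, not values from this PR.

import requests

API_PREFIX = "http://localhost:8000/api"  # placeholder base URL, not from this PR
ORG_ID = "00000000-0000-0000-0000-000000000000"  # placeholder org id
CRAWLER_AUTH_HEADERS = {"Authorization": "Bearer <crawler-role-token>"}  # placeholder

r = requests.post(
    f"{API_PREFIX}/orgs/{ORG_ID}/all-crawls/delete",
    headers=CRAWLER_AUTH_HEADERS,
    json={"crawl_ids": ["<another-users-item-id>"]},  # placeholder id
)
assert r.status_code == 403
assert r.json()["detail"] == "not_allowed"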


@@ -203,12 +203,16 @@ class CrawlOps(BaseCrawlOps):
return crawls, total
async def delete_crawls(
self, org: Organization, delete_list: DeleteCrawlList, type_="crawl"
self,
org: Organization,
delete_list: DeleteCrawlList,
type_="crawl",
user: Optional[User] = None,
):
"""Delete a list of crawls by id for given org"""
count, cids_to_update, quota_reached = await super().delete_crawls(
org, delete_list, type_
org, delete_list, type_, user
)
if count < 1:
@@ -295,65 +299,6 @@ class CrawlOps(BaseCrawlOps):
return True
async def update_crawl_state(self, crawl_id: str, state: str):
"""called only when job container is being stopped/canceled"""
data = {"state": state}
# if cancelation, set the finish time here
if state == "canceled":
data["finished"] = dt_now()
await self.crawls.find_one_and_update(
{
"_id": crawl_id,
"type": "crawl",
"state": {"$in": RUNNING_AND_STARTING_STATES},
},
{"$set": data},
)
async def shutdown_crawl(self, crawl_id: str, org: Organization, graceful: bool):
"""stop or cancel specified crawl"""
result = None
try:
result = await self.crawl_manager.shutdown_crawl(
crawl_id, org.id_str, graceful=graceful
)
if result.get("success"):
if graceful:
await self.crawls.find_one_and_update(
{"_id": crawl_id, "type": "crawl", "oid": org.id},
{"$set": {"stopping": True}},
)
return result
except Exception as exc:
# pylint: disable=raise-missing-from
# if reached here, probably crawl doesn't exist anymore
raise HTTPException(
status_code=404, detail=f"crawl_not_found, (details: {exc})"
)
# if job no longer running, canceling is considered success,
# but graceful stoppage is not possible, so would be a failure
if result.get("error") == "Not Found":
if not graceful:
await self.update_crawl_state(crawl_id, "canceled")
crawl = await self.get_crawl_raw(crawl_id, org)
if not await self.crawl_configs.stats_recompute_last(
crawl["cid"], 0, -1
):
raise HTTPException(
status_code=404,
detail=f"crawl_config_not_found: {crawl['cid']}",
)
return {"success": True}
# return whatever detail may be included in the response
raise HTTPException(status_code=400, detail=result)
async def _crawl_queue_len(self, redis, key):
try:
return await redis.zcard(key)
@@ -774,25 +719,7 @@ def init_crawls_api(
user: User = Depends(user_dep),
org: Organization = Depends(org_crawl_dep),
):
# Ensure user has appropriate permissions for all crawls in list:
# - Crawler users can delete their own crawls
# - Org owners can delete any crawls in org
for crawl_id in delete_list.crawl_ids:
crawl_raw = await ops.get_crawl_raw(crawl_id, org)
crawl = Crawl.from_dict(crawl_raw)
if (crawl.userid != user.id) and not org.is_owner(user):
raise HTTPException(status_code=403, detail="Not Allowed")
if not crawl.finished:
try:
await ops.shutdown_crawl(crawl_id, org, graceful=False)
except Exception as exc:
# pylint: disable=raise-missing-from
raise HTTPException(
status_code=400, detail=f"Error Stopping Crawl: {exc}"
)
return await ops.delete_crawls(org, delete_list)
return await ops.delete_crawls(org, delete_list, "crawl", user)
@app.get(
"/orgs/all/crawls/{crawl_id}/replay.json",


@@ -187,10 +187,15 @@ class UploadOps(BaseCrawlOps):
return {"id": crawl_id, "added": True, "storageQuotaReached": quota_reached}
async def delete_uploads(self, delete_list: DeleteCrawlList, org: Organization):
async def delete_uploads(
self,
delete_list: DeleteCrawlList,
org: Organization,
user: Optional[User] = None,
):
"""Delete uploaded crawls"""
deleted_count, _, quota_reached = await self.delete_crawls(
org, delete_list, "upload"
org, delete_list, "upload", user
)
if deleted_count < 1:
@@ -399,6 +404,7 @@ def init_uploads_api(
@app.post("/orgs/{oid}/uploads/delete", tags=["uploads"])
async def delete_uploads(
delete_list: DeleteCrawlList,
user: User = Depends(user_dep),
org: Organization = Depends(org_crawl_dep),
):
return await ops.delete_uploads(delete_list, org)
return await ops.delete_uploads(delete_list, org, user)


@@ -308,7 +308,7 @@ def test_delete_crawls_crawler(
)
assert r.status_code == 403
data = r.json()
assert data["detail"] == "Not Allowed"
assert data["detail"] == "not_allowed"
# Test that crawler user can delete own crawl
r = requests.post(


@@ -302,7 +302,17 @@ def test_update_upload_metadata(admin_auth_headers, default_org_id):
assert data["collectionIds"] == UPDATED_COLLECTION_IDS
def test_delete_stream_upload(admin_auth_headers, default_org_id):
def test_delete_stream_upload(admin_auth_headers, crawler_auth_headers, default_org_id):
# Verify non-admin user who didn't upload crawl can't delete it
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/delete",
headers=crawler_auth_headers,
json={"crawl_ids": [upload_id]},
)
assert r.status_code == 403
assert r.json()["detail"] == "not_allowed"
# Verify user who created upload can delete it
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/delete",
headers=admin_auth_headers,
@@ -842,6 +852,7 @@ def test_update_upload_metadata_all_crawls(admin_auth_headers, default_org_id):
def test_delete_form_upload_and_crawls_from_all_crawls(
admin_auth_headers,
crawler_auth_headers,
default_org_id,
all_crawls_delete_crawl_ids,
all_crawls_delete_config_id,
@@ -890,6 +901,15 @@ def test_delete_form_upload_and_crawls_from_all_crawls(
combined_crawl_size = crawl_1_size + crawl_2_size
total_size = combined_crawl_size + upload_size
# Verify that non-admin user can't delete another's items
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/delete",
headers=crawler_auth_headers,
json={"crawl_ids": crawls_to_delete},
)
assert r.status_code == 403
assert r.json()["detail"] == "not_allowed"
# Delete mixed type archived items
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/delete",


@@ -1024,10 +1024,20 @@ ${this.crawl?.description}
icon: "check2-circle",
});
} catch (e: any) {
let message = msg(
str`Sorry, couldn't delete archived item at this time.`
);
if (e.isApiError) {
if (e.details == "not_allowed") {
message = msg(
str`Only org owners can delete other users' archived items.`
);
} else if (e.message) {
message = e.message;
}
}
this.notify({
message:
(e.isApiError && e.message) ||
msg("Sorry, couldn't run crawl at this time."),
message: message,
variant: "danger",
icon: "exclamation-octagon",
});


@@ -719,10 +719,20 @@ export class CrawlsList extends LiteElement {
});
this.fetchArchivedItems();
} catch (e: any) {
let message = msg(
str`Sorry, couldn't delete archived item at this time.`
);
if (e.isApiError) {
if (e.details == "not_allowed") {
message = msg(
str`Only org owners can delete other users' archived items.`
);
} else if (e.message) {
message = e.message;
}
}
this.notify({
message:
(e.isApiError && e.message) ||
msg(str`Sorry, couldn't delete archived item at this time.`),
message: message,
variant: "danger",
icon: "exclamation-octagon",
});


@@ -1626,10 +1626,20 @@ export class WorkflowDetail extends LiteElement {
});
this.fetchCrawls();
} catch (e: any) {
let message = msg(
str`Sorry, couldn't delete archived item at this time.`
);
if (e.isApiError) {
if (e.details == "not_allowed") {
message = msg(
str`Only org owners can delete other users' archived items.`
);
} else if (e.message) {
message = e.message;
}
}
this.notify({
message:
(e.isApiError && e.message) ||
msg("Sorry, couldn't run crawl at this time."),
message: message,
variant: "danger",
icon: "exclamation-octagon",
});