diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index f2447741..94026128 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -18,7 +18,7 @@ import pymongo
 from .db import BaseMongoModel
 from .users import User
 from .orgs import Organization, MAX_CRAWL_SCALE
-from .storages import get_presigned_url
+from .storages import get_presigned_url, delete_crawl_file_object


 # ============================================================================
@@ -352,13 +352,26 @@ class CrawlOps:
         for update in updates:
             await self.crawls.find_one_and_update(*update)

-    async def delete_crawls(self, oid: uuid.UUID, delete_list: DeleteCrawlList):
+    async def delete_crawls(self, org: Organization, delete_list: DeleteCrawlList):
         """Delete a list of crawls by id for given org"""
+        for crawl_id in delete_list.crawl_ids:
+            await self._delete_crawl_files(org, crawl_id)
+
         res = await self.crawls.delete_many(
-            {"_id": {"$in": delete_list.crawl_ids}, "oid": oid}
+            {"_id": {"$in": delete_list.crawl_ids}, "oid": org.id}
         )
+
         return res.deleted_count

+    async def _delete_crawl_files(self, org: Organization, crawl_id: str):
+        """Delete files associated with crawl from storage."""
+        crawl_raw = await self.get_crawl_raw(crawl_id, org)
+        crawl = Crawl.from_dict(crawl_raw)
+        for file_ in crawl.files:
+            status_code = await delete_crawl_file_object(org, file_, self.crawl_manager)
+            if status_code != 204:
+                raise HTTPException(status_code=400, detail="file_deletion_error")
+
     async def add_new_crawl(self, crawl_id: str, crawlconfig):
         """initialize new crawl"""
         crawl = Crawl(
@@ -651,17 +664,29 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user

     @app.post("/orgs/{oid}/crawls/delete", tags=["crawls"])
     async def delete_crawls(
-        delete_list: DeleteCrawlList, org: Organization = Depends(org_crawl_dep)
+        delete_list: DeleteCrawlList,
+        user: User = Depends(user_dep),
+        org: Organization = Depends(org_crawl_dep),
     ):
-        try:
-            for crawl_id in delete_list:
-                await crawl_manager.stop_crawl(crawl_id, org.id, graceful=False)
+        # Ensure user has appropriate permissions for all crawls in list:
+        # - Crawler users can delete their own crawls
+        # - Org owners can delete any crawls in org
+        for crawl_id in delete_list.crawl_ids:
+            crawl_raw = await ops.get_crawl_raw(crawl_id, org)
+            crawl = Crawl.from_dict(crawl_raw)
+            if (crawl.userid != user.id) and not org.is_owner(user):
+                raise HTTPException(status_code=403, detail="Not Allowed")

-        except Exception as exc:
-            # pylint: disable=raise-missing-from
-            raise HTTPException(status_code=400, detail=f"Error Stopping Crawl: {exc}")
+            if not crawl.finished:
+                try:
+                    await ops.shutdown_crawl(crawl_id, org, graceful=False)
+                except Exception as exc:
+                    # pylint: disable=raise-missing-from
+                    raise HTTPException(
+                        status_code=400, detail=f"Error Stopping Crawl: {exc}"
+                    )

-        res = await ops.delete_crawls(org.id, delete_list)
+        res = await ops.delete_crawls(org, delete_list)

         return {"deleted": res}

diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py
index bf203222..3386663e 100644
--- a/backend/btrixcloud/storages.py
+++ b/backend/btrixcloud/storages.py
@@ -122,3 +122,29 @@ async def get_presigned_url(org, crawlfile, crawl_manager, duration=3600):
     )

     return presigned_url
+
+
+# ============================================================================
+async def delete_crawl_file_object(org, crawlfile, crawl_manager):
+    """delete crawl file from storage."""
+    status_code = None
+
+    if crawlfile.def_storage_name:
+        s3storage = await crawl_manager.get_default_storage(crawlfile.def_storage_name)
+
+    elif org.storage.type == "s3":
+        s3storage = org.storage
+
+    else:
+        raise TypeError("No Default Storage Found, Invalid Storage Type")
+
+    async with get_s3_client(s3storage, s3storage.use_access_for_presign) as (
+        client,
+        bucket,
+        key,
+    ):
+        key += crawlfile.filename
+        response = await client.delete_object(Bucket=bucket, Key=key)
+        status_code = response["ResponseMetadata"]["HTTPStatusCode"]
+
+    return status_code
diff --git a/backend/test/conftest.py b/backend/test/conftest.py
index aba76d53..6a752da0 100644
--- a/backend/test/conftest.py
+++ b/backend/test/conftest.py
@@ -202,6 +202,34 @@ def crawler_crawl_id(crawler_auth_headers, default_org_id):
         time.sleep(5)


+@pytest.fixture(scope="session")
+def wr_specs_crawl_id(crawler_auth_headers, default_org_id):
+    # Start crawl.
+    crawl_data = {
+        "runNow": True,
+        "name": "Webrecorder Specs sample crawl",
+        "config": {"seeds": ["https://specs.webrecorder.net/"], "limit": 1},
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    crawl_id = data["run_now_job"]
+    # Wait for it to complete and then return crawl ID
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=crawler_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "complete":
+            return crawl_id
+        time.sleep(5)
+
+
 @pytest.fixture(scope="session")
 def crawler_config_id(crawler_crawl_id):
     return _crawler_config_id
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index 0e7b1d09..f3b19548 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -150,3 +150,74 @@ def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id):
     data = r.json()
     assert data["tags"] == []
     assert not data["notes"]
+
+
+def test_delete_crawls_crawler(
+    crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
+):
+    # Test that crawler user can't delete another user's crawls
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
+        headers=crawler_auth_headers,
+        json={"crawl_ids": [admin_crawl_id]},
+    )
+    assert r.status_code == 403
+    data = r.json()
+    assert data["detail"] == "Not Allowed"
+
+    # Test that crawler user can delete own crawl
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
+        headers=crawler_auth_headers,
+        json={"crawl_ids": [crawler_crawl_id]},
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["deleted"] == 1
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 404
+
+
+def test_delete_crawls_org_owner(
+    admin_auth_headers,
+    crawler_auth_headers,
+    default_org_id,
+    admin_crawl_id,
+    crawler_crawl_id,
+    wr_specs_crawl_id,
+):
+    # Test that org owner can delete own crawl
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
+        headers=admin_auth_headers,
+        json={"crawl_ids": [admin_crawl_id]},
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["deleted"]
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 404
+
+    # Test that org owner can delete another org user's crawls
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
+        headers=admin_auth_headers,
+        json={"crawl_ids": [wr_specs_crawl_id]},
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["deleted"] == 1
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{wr_specs_crawl_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 404
diff --git a/backend/test_nightly/conftest.py b/backend/test_nightly/conftest.py
index d78b00d5..dfa39a62 100644
--- a/backend/test_nightly/conftest.py
+++ b/backend/test_nightly/conftest.py
@@ -41,3 +41,67 @@ def default_org_id(admin_auth_headers):
         except:
             print("Waiting for default org id")
             time.sleep(5)
+
+
+@pytest.fixture(scope="session")
+def crawl_id_wr(admin_auth_headers, default_org_id):
+    # Start crawl.
+    crawl_data = {
+        "runNow": True,
+        "name": "Webrecorder admin test crawl",
+        "tags": ["wr", "nightly testing"],
+        "config": {
+            "seeds": ["https://webrecorder.net/"],
+            "limit": 1,
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    crawl_id = data["run_now_job"]
+    # Wait for it to complete and then return crawl ID
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "complete":
+            return crawl_id
+        time.sleep(5)
+
+
+@pytest.fixture(scope="session")
+def crawl_id_wr_specs(admin_auth_headers, default_org_id):
+    # Start crawl.
+    crawl_data = {
+        "runNow": True,
+        "name": "Webrecorder Specs admin test crawl",
+        "tags": ["wr-specs", "nightly testing"],
+        "config": {
+            "seeds": ["https://specs.webrecorder.net/"],
+            "limit": 1,
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    crawl_id = data["run_now_job"]
+    # Wait for it to complete and then return crawl ID
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "complete":
+            return crawl_id
+        time.sleep(5)
diff --git a/backend/test_nightly/test_delete_crawls.py b/backend/test_nightly/test_delete_crawls.py
new file mode 100644
index 00000000..2e9fabf5
--- /dev/null
+++ b/backend/test_nightly/test_delete_crawls.py
@@ -0,0 +1,88 @@
+import os
+import requests
+import time
+
+from .conftest import API_PREFIX, HOST_PREFIX
+
+
+def test_delete_crawls(
+    tmp_path, admin_auth_headers, default_org_id, crawl_id_wr, crawl_id_wr_specs
+):
+    # Check that crawls have associated files
+    crawl_resource_urls = []
+
+    def _file_is_retrievable(url):
+        """Attempt to retrieve file at url and return True or False."""
+        file_path = str(tmp_path / "test_download")
+        if os.path.exists(file_path):
+            os.remove(file_path)
+
+        r = requests.get(f"{HOST_PREFIX}{url}")
+        if not r.status_code == 200:
+            return False
+
+        with open(file_path, "wb") as fd:
+            fd.write(r.content)
+
+        if not (os.path.isfile(file_path) and os.path.getsize(file_path) > 0):
+            return False
+
+        os.remove(file_path)
+        return True
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    resources = data["resources"]
+    assert resources
+    for resource in resources:
+        crawl_resource_urls.append(resource["path"])
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    resources = data["resources"]
+    assert resources
+    for resource in resources:
+        crawl_resource_urls.append(resource["path"])
+
+    # Test retrieving resources
+    for url in crawl_resource_urls:
+        assert _file_is_retrievable(url)
+
+    # Delete crawls
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
+        headers=admin_auth_headers,
+        json={"crawl_ids": [crawl_id_wr, crawl_id_wr_specs]},
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["deleted"] == 2
+
+    # Verify that crawls don't exist in db
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 404
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 404
+
+    # Give Minio time to delete the files
+    time.sleep(120)
+
+    # Verify that files are no longer retrievable from storage
+    for url in crawl_resource_urls:
+        assert not _file_is_retrievable(url)
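
Usage note (not part of the patch above): a minimal sketch of how a client might call the new delete endpoint once this change is deployed, mirroring the request shape used in test_run_crawl.py. The base URL, org id, crawl id, and bearer token are placeholder assumptions, not values taken from this change.

import requests

API_PREFIX = "https://<your-btrix-host>/api"   # assumed deployment URL, placeholder
ORG_ID = "<org-uuid>"                          # placeholder
CRAWL_ID = "<crawl-id>"                        # placeholder
HEADERS = {"Authorization": "Bearer <jwt>"}    # placeholder auth token

# POST a DeleteCrawlList body; per the endpoint above, a 403 is returned if the
# requesting user is neither the crawl's owner nor an org owner, and a running
# crawl is shut down before the crawl record and its stored files are deleted.
r = requests.post(
    f"{API_PREFIX}/orgs/{ORG_ID}/crawls/delete",
    headers=HEADERS,
    json={"crawl_ids": [CRAWL_ID]},
)
r.raise_for_status()
print(r.json())  # e.g. {"deleted": 1}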