Fix POST /orgs/{oid}/crawls/delete (#591)

* Fix POST /orgs/{oid}/crawls/delete

- Add permissions check to ensure crawler users can only delete
their own crawls
- Fix broken delete_crawls endpoint
- Delete crawl files from storage as well as deleting the crawl from the db
- Add tests, including a nightly test that ensures crawl files are
no longer accessible after the crawl is deleted
Tessa Walsh 2023-02-15 21:06:12 -05:00 committed by GitHub
parent 9532f48515
commit bd4fba7af7
6 changed files with 313 additions and 11 deletions
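
For reference, the fixed endpoint takes a JSON body listing crawl ids and returns the number of deleted crawls; crawler users receive a 403 if any listed crawl belongs to another user, while org owners may delete any crawl in the org. A minimal sketch of calling it with the requests library — the host prefix, org id, crawl id, and token below are hypothetical placeholders:

import requests

API_PREFIX = "http://localhost:9871/api"  # hypothetical host/prefix
ORG_ID = "<org-uuid>"                     # hypothetical org id
AUTH_HEADERS = {"Authorization": "Bearer <access-token>"}  # hypothetical token

r = requests.post(
    f"{API_PREFIX}/orgs/{ORG_ID}/crawls/delete",
    headers=AUTH_HEADERS,
    json={"crawl_ids": ["<crawl-id>"]},   # hypothetical crawl id
)
# Expect 200 with {"deleted": <count>}, or 403 if the user may not delete a listed crawl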

View File

@@ -18,7 +18,7 @@ import pymongo
 from .db import BaseMongoModel
 from .users import User
 from .orgs import Organization, MAX_CRAWL_SCALE
-from .storages import get_presigned_url
+from .storages import get_presigned_url, delete_crawl_file_object

 # ============================================================================
@@ -352,13 +352,26 @@ class CrawlOps:
         for update in updates:
             await self.crawls.find_one_and_update(*update)

-    async def delete_crawls(self, oid: uuid.UUID, delete_list: DeleteCrawlList):
+    async def delete_crawls(self, org: Organization, delete_list: DeleteCrawlList):
         """Delete a list of crawls by id for given org"""
+        for crawl_id in delete_list.crawl_ids:
+            await self._delete_crawl_files(org, crawl_id)
+
         res = await self.crawls.delete_many(
-            {"_id": {"$in": delete_list.crawl_ids}, "oid": oid}
+            {"_id": {"$in": delete_list.crawl_ids}, "oid": org.id}
         )
         return res.deleted_count

+    async def _delete_crawl_files(self, org: Organization, crawl_id: str):
+        """Delete files associated with crawl from storage."""
+        crawl_raw = await self.get_crawl_raw(crawl_id, org)
+        crawl = Crawl.from_dict(crawl_raw)
+        for file_ in crawl.files:
+            status_code = await delete_crawl_file_object(org, file_, self.crawl_manager)
+            if status_code != 204:
+                raise HTTPException(status_code=400, detail="file_deletion_error")
+
     async def add_new_crawl(self, crawl_id: str, crawlconfig):
         """initialize new crawl"""
         crawl = Crawl(
@@ -651,17 +664,29 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
     @app.post("/orgs/{oid}/crawls/delete", tags=["crawls"])
     async def delete_crawls(
-        delete_list: DeleteCrawlList, org: Organization = Depends(org_crawl_dep)
+        delete_list: DeleteCrawlList,
+        user: User = Depends(user_dep),
+        org: Organization = Depends(org_crawl_dep),
     ):
-        try:
-            for crawl_id in delete_list:
-                await crawl_manager.stop_crawl(crawl_id, org.id, graceful=False)
+        # Ensure user has appropriate permissions for all crawls in list:
+        # - Crawler users can delete their own crawls
+        # - Org owners can delete any crawls in org
+        for crawl_id in delete_list.crawl_ids:
+            crawl_raw = await ops.get_crawl_raw(crawl_id, org)
+            crawl = Crawl.from_dict(crawl_raw)
+
+            if (crawl.userid != user.id) and not org.is_owner(user):
+                raise HTTPException(status_code=403, detail="Not Allowed")
-        except Exception as exc:
-            # pylint: disable=raise-missing-from
-            raise HTTPException(status_code=400, detail=f"Error Stopping Crawl: {exc}")
+            if not crawl.finished:
+                try:
+                    await ops.shutdown_crawl(crawl_id, org, graceful=False)
+                except Exception as exc:
+                    # pylint: disable=raise-missing-from
+                    raise HTTPException(
+                        status_code=400, detail=f"Error Stopping Crawl: {exc}"
+                    )
-        res = await ops.delete_crawls(org.id, delete_list)
+        res = await ops.delete_crawls(org, delete_list)
         return {"deleted": res}

View File

@@ -122,3 +122,29 @@ async def get_presigned_url(org, crawlfile, crawl_manager, duration=3600):
)
return presigned_url
# ============================================================================
async def delete_crawl_file_object(org, crawlfile, crawl_manager):
"""delete crawl file from storage."""
status_code = None
if crawlfile.def_storage_name:
s3storage = await crawl_manager.get_default_storage(crawlfile.def_storage_name)
elif org.storage.type == "s3":
s3storage = org.storage
else:
raise TypeError("No Default Storage Found, Invalid Storage Type")
async with get_s3_client(s3storage, s3storage.use_access_for_presign) as (
client,
bucket,
key,
):
key += crawlfile.filename
response = await client.delete_object(Bucket=bucket, Key=key)
status_code = response["ResponseMetadata"]["HTTPStatusCode"]
return status_code
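
delete_crawl_file_object relies on the existing get_s3_client helper (not part of this diff) to yield a (client, bucket, key) tuple for the resolved storage. For orientation, a minimal sketch of such a helper built on aiobotocore — the storage field names and exact signature here are assumptions inferred from the call site, not the project's actual implementation:

from contextlib import asynccontextmanager
from urllib.parse import urlsplit

from aiobotocore.session import get_session


@asynccontextmanager
async def get_s3_client(storage, use_access_for_presign=False):
    """Yield (client, bucket, key prefix) for an S3 storage config (sketch, assumed fields)."""
    endpoint = storage.access_endpoint_url if use_access_for_presign else storage.endpoint_url
    parts = urlsplit(endpoint)
    bucket, _, key_prefix = parts.path[1:].partition("/")

    session = get_session()
    async with session.create_client(
        "s3",
        endpoint_url=f"{parts.scheme}://{parts.netloc}",
        aws_access_key_id=storage.access_key,
        aws_secret_access_key=storage.secret_key,
    ) as client:
        yield client, bucket, key_prefix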

View File

@@ -202,6 +202,34 @@ def crawler_crawl_id(crawler_auth_headers, default_org_id):
time.sleep(5)
@pytest.fixture(scope="session")
def wr_specs_crawl_id(crawler_auth_headers, default_org_id):
# Start crawl.
crawl_data = {
"runNow": True,
"name": "Webrecorder Specs sample crawl",
"config": {"seeds": ["https://specs.webrecorder.net/"], "limit": 1},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=crawl_data,
)
data = r.json()
crawl_id = data["run_now_job"]
# Wait for it to complete and then return crawl ID
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=crawler_auth_headers,
)
data = r.json()
if data["state"] == "complete":
return crawl_id
time.sleep(5)
@pytest.fixture(scope="session")
def crawler_config_id(crawler_crawl_id):
return _crawler_config_id

View File

@@ -150,3 +150,74 @@ def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id):
data = r.json()
assert data["tags"] == []
assert not data["notes"]
def test_delete_crawls_crawler(
crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
# Test that crawler user can't delete another user's crawls
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=crawler_auth_headers,
json={"crawl_ids": [admin_crawl_id]},
)
assert r.status_code == 403
data = r.json()
assert data["detail"] == "Not Allowed"
# Test that crawler user can delete own crawl
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=crawler_auth_headers,
json={"crawl_ids": [crawler_crawl_id]},
)
assert r.status_code == 200
data = r.json()
assert data["deleted"] == 1
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 404
def test_delete_crawls_org_owner(
admin_auth_headers,
crawler_auth_headers,
default_org_id,
admin_crawl_id,
crawler_crawl_id,
wr_specs_crawl_id,
):
# Test that org owner can delete own crawl
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=admin_auth_headers,
json={"crawl_ids": [admin_crawl_id]},
)
assert r.status_code == 200
data = r.json()
assert data["deleted"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
headers=admin_auth_headers,
)
assert r.status_code == 404
# Test that org owner can delete another org user's crawls
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=admin_auth_headers,
json={"crawl_ids": [wr_specs_crawl_id]},
)
assert r.status_code == 200
data = r.json()
assert data["deleted"] == 1
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{wr_specs_crawl_id}",
headers=admin_auth_headers,
)
assert r.status_code == 404

View File

@@ -41,3 +41,67 @@ def default_org_id(admin_auth_headers):
except:
print("Waiting for default org id")
time.sleep(5)
@pytest.fixture(scope="session")
def crawl_id_wr(admin_auth_headers, default_org_id):
# Start crawl.
crawl_data = {
"runNow": True,
"name": "Webrecorder admin test crawl",
"tags": ["wr", "nightly testing"],
"config": {
"seeds": ["https://webrecorder.net/"],
"limit": 1,
},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
data = r.json()
crawl_id = data["run_now_job"]
# Wait for it to complete and then return crawl ID
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
if data["state"] == "complete":
return crawl_id
time.sleep(5)
@pytest.fixture(scope="session")
def crawl_id_wr_specs(admin_auth_headers, default_org_id):
# Start crawl.
crawl_data = {
"runNow": True,
"name": "Webrecorder Specs admin test crawl",
"tags": ["wr-specs", "nightly testing"],
"config": {
"seeds": ["https://specs.webrecorder.net/"],
"limit": 1,
},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
data = r.json()
crawl_id = data["run_now_job"]
# Wait for it to complete and then return crawl ID
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
if data["state"] == "complete":
return crawl_id
time.sleep(5)
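
The two nightly fixtures above (and the wr_specs_crawl_id fixture earlier) repeat the same start-crawl-and-poll pattern. A small helper along these lines could factor it out — a sketch only, not part of the commit, assuming the same module-level requests, time, and API_PREFIX imports used by the fixtures:

def run_crawl_and_wait(org_id, headers, crawl_data, poll_interval=5):
    """Create a crawl config with runNow=True, then poll replay.json until the crawl completes."""
    r = requests.post(
        f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/",
        headers=headers,
        json=crawl_data,
    )
    crawl_id = r.json()["run_now_job"]

    # Poll until the crawl reaches the "complete" state, then return its id
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
            headers=headers,
        )
        if r.json()["state"] == "complete":
            return crawl_id
        time.sleep(poll_interval)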

View File

@@ -0,0 +1,88 @@
import os
import requests
import time
from .conftest import API_PREFIX, HOST_PREFIX
def test_delete_crawls(
tmp_path, admin_auth_headers, default_org_id, crawl_id_wr, crawl_id_wr_specs
):
# Check that crawls have associated files
crawl_resource_urls = []
def _file_is_retrievable(url):
"""Attempt to retrieve file at url and return True or False."""
file_path = str(tmp_path / "test_download")
if os.path.exists(file_path):
os.remove(file_path)
r = requests.get(f"{HOST_PREFIX}{url}")
if not r.status_code == 200:
return False
with open(file_path, "wb") as fd:
fd.write(r.content)
if not (os.path.isfile(file_path) and os.path.getsize(file_path) > 0):
return False
os.remove(file_path)
return True
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
resources = data["resources"]
assert resources
for resource in resources:
crawl_resource_urls.append(resource["path"])
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
resources = data["resources"]
assert resources
for resource in resources:
crawl_resource_urls.append(resource["path"])
# Test retrieving resources
for url in crawl_resource_urls:
assert _file_is_retrievable(url)
# Delete crawls
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=admin_auth_headers,
json={"crawl_ids": [crawl_id_wr, crawl_id_wr_specs]},
)
assert r.status_code == 200
data = r.json()
assert data["deleted"] == 2
# Verify that crawls don't exist in db
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 404
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 404
# Give Minio time to delete the files
time.sleep(120)
# Verify that files are no longer retrievable from storage
for url in crawl_resource_urls:
assert not _file_is_retrievable(url)
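
The fixed two-minute sleep gives MinIO time to process the deletions before the final check. A more robust alternative (a sketch, not part of this commit) would poll the same resource URLs until they stop being retrievable or a deadline passes; it reuses the requests, time, and HOST_PREFIX imports already present in the test module:

def _wait_until_not_retrievable(urls, timeout=300, interval=10):
    """Return True once every url stops returning 200, or False after timeout seconds."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if all(requests.get(f"{HOST_PREFIX}{url}").status_code != 200 for url in urls):
            return True
        time.sleep(interval)
    return False

# usage in the test: assert _wait_until_not_retrievable(crawl_resource_urls)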