crawls list: unset errors in crawls list response to avoid very large responses (#904)

* crawls list: unset errors in crawls list response to avoid very large responses #872

* Remove errors from crawl replay.json

* Add tests to ensure errors are excluded from crawl GET endpoints

* Update tests to accept None for errors
---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer 2023-06-02 18:52:59 -07:00 committed by GitHub
parent 0284903b34
commit 3f42515914
2 changed files with 33 additions and 1 deletion


@@ -280,7 +280,7 @@ class CrawlOps:
             {"$set": {"fileCount": {"$size": "$files"}}},
             {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
             {"$set": {"firstSeed": "$firstSeedObject.url"}},
-            {"$unset": ["firstSeedObject"]},
+            {"$unset": ["firstSeedObject", "errors"]},
             {
                 "$lookup": {
                     "from": "crawl_configs",
@@ -394,6 +394,8 @@ class CrawlOps:
             res["resources"] = await self._resolve_signed_urls(files, org, crawlid)
 
+        del res["errors"]
+
         crawl = CrawlOut.from_dict(res)
         return await self._resolve_crawl_refs(crawl, org)
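
For context, MongoDB's $unset aggregation stage drops the named fields from every document flowing through the pipeline, so the potentially large per-page errors arrays never leave the database on list queries. Below is a minimal sketch of that behavior, assuming a pymongo client; the connection string and collection names are illustrative, with the pipeline stages matching the diff above.

    # Sketch only: connection details and database name are hypothetical.
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")
    crawls = client["btrix"]["crawls"]

    pipeline = [
        {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
        {"$set": {"firstSeed": "$firstSeedObject.url"}},
        # Dropping "errors" here keeps large error lists out of every
        # document returned by the list endpoint.
        {"$unset": ["firstSeedObject", "errors"]},
    ]
    for doc in crawls.aggregate(pipeline):
        assert "errors" not in doc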


@@ -115,6 +115,36 @@ def test_crawls_include_seed_info(admin_auth_headers, default_org_id, admin_craw
         assert crawl["seedCount"] > 0
 
 
+def test_crawls_exclude_errors(admin_auth_headers, default_org_id, admin_crawl_id):
+    # Get endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert "errors" not in data or data.get("errors") is None
+
+    # replay.json endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert "errors" not in data or data.get("errors") is None
+
+    # List endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    crawls = r.json()["items"]
+    for crawl in crawls:
+        assert "errors" not in crawl or crawl.get("errors") is None
+
+
 def test_download_wacz():
     r = requests.get(HOST_PREFIX + wacz_path)
     assert r.status_code == 200
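
The tests accept either a missing key or an explicit null ("Update tests to accept None for errors") because a response model that still declares the field will serialize it as None even after del res["errors"] removes it from the raw document. A hypothetical Pydantic sketch of that serialization behavior; the field layout here is assumed, not the actual CrawlOut model:

    from typing import List, Optional
    from pydantic import BaseModel

    # Hypothetical stand-in for the CrawlOut response model.
    class CrawlOut(BaseModel):
        id: str
        errors: Optional[List[str]] = None  # absent in input -> null in output

    print(CrawlOut(id="crawl-1").dict())
    # {'id': 'crawl-1', 'errors': None}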