From 3f42515914b428910067b8431f9d3d823fb30bdb Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Fri, 2 Jun 2023 18:52:59 -0700
Subject: [PATCH] =?UTF-8?q?crawls=20list:=20unset=20errors=20in=20crawls?=
 =?UTF-8?q?=20list=20response=20to=20avoid=20very=20large=E2=80=A6=20(#904?=
 =?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* crawls list: unset errors in crawls list response to avoid very large responses #872

* Remove errors from crawl replay.json

* Add tests to ensure errors are excluded from crawl GET endpoints

* Update tests to accept None for errors

---------

Co-authored-by: Tessa Walsh
---
 backend/btrixcloud/crawls.py   |  4 +++-
 backend/test/test_run_crawl.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index 6d969227..175082e0 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -280,7 +280,7 @@ class CrawlOps:
             {"$set": {"fileCount": {"$size": "$files"}}},
             {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
             {"$set": {"firstSeed": "$firstSeedObject.url"}},
-            {"$unset": ["firstSeedObject"]},
+            {"$unset": ["firstSeedObject", "errors"]},
             {
                 "$lookup": {
                     "from": "crawl_configs",
@@ -394,6 +394,8 @@ class CrawlOps:
 
         res["resources"] = await self._resolve_signed_urls(files, org, crawlid)
 
+        del res["errors"]
+
         crawl = CrawlOut.from_dict(res)
 
         return await self._resolve_crawl_refs(crawl, org)
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index 38d1d36f..ce7d67d6 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -115,6 +115,36 @@ def test_crawls_include_seed_info(admin_auth_headers, default_org_id, admin_craw
     assert crawl["seedCount"] > 0
 
 
+def test_crawls_exclude_errors(admin_auth_headers, default_org_id, admin_crawl_id):
+    # Get endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert "errors" not in data or data.get("errors") is None
+
+    # replay.json endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert "errors" not in data or data.get("errors") is None
+
+    # List endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    crawls = r.json()["items"]
+    for crawl in crawls:
+        assert "errors" not in crawl or crawl.get("errors") is None
+
+
 def test_download_wacz():
     r = requests.get(HOST_PREFIX + wacz_path)
     assert r.status_code == 200
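
The patch drops the potentially very large "errors" field in two places: the crawl list endpoint excludes it inside the MongoDB aggregation via a "$unset" stage, and the single-crawl / replay.json path deletes the key from the response dict before serialization. The sketch below is not part of the patch; it illustrates both ideas with a plain dict standing in for a crawl record. The LIST_PIPELINE_FRAGMENT constant and strip_errors helper are hypothetical names used only for illustration and do not exist in btrixcloud.

# Illustrative sketch only -- not code from backend/btrixcloud/crawls.py.
from typing import Any, Dict, List

# (1) List endpoint idea: a "$unset" aggregation stage removes fields from
# every document MongoDB returns, so large error logs never leave the database.
LIST_PIPELINE_FRAGMENT: List[Dict[str, Any]] = [
    {"$set": {"firstSeed": "$firstSeedObject.url"}},
    {"$unset": ["firstSeedObject", "errors"]},
]

# (2) Single-crawl / replay.json idea: drop the key from the already-fetched
# document. The patch itself uses `del res["errors"]`; pop() with a default
# is used here only so the sketch also works when the field is absent.
def strip_errors(res: Dict[str, Any]) -> Dict[str, Any]:
    res.pop("errors", None)
    return res

if __name__ == "__main__":
    crawl = {"id": "crawl-1", "state": "complete", "errors": ["err"] * 10000}
    assert "errors" not in strip_errors(crawl)

The two approaches differ in where the cost is paid: the "$unset" stage keeps the large field from ever being transferred out of the database for list queries, while the del/pop approach only trims a single already-fetched document before it is returned to the client, which is what the new tests in test_run_crawl.py verify for the GET, replay.json, and list endpoints.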