From 3f42515914b428910067b8431f9d3d823fb30bdb Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Fri, 2 Jun 2023 18:52:59 -0700
Subject: [PATCH] =?UTF-8?q?crawls=20list:=20unset=20errors=20in=20crawls?=
 =?UTF-8?q?=20list=20response=20to=20avoid=20very=20large=E2=80=A6=20(#904?=
 =?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* crawls list: unset errors in crawls list response to avoid very large responses #872

* Remove errors from crawl replay.json

* Add tests to ensure errors are excluded from crawl GET endpoints

* Update tests to accept None for errors

---------

Co-authored-by: Tessa Walsh
---
 backend/btrixcloud/crawls.py   |  4 +++-
 backend/test/test_run_crawl.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index 6d969227..175082e0 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -280,7 +280,7 @@ class CrawlOps:
             {"$set": {"fileCount": {"$size": "$files"}}},
             {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
             {"$set": {"firstSeed": "$firstSeedObject.url"}},
-            {"$unset": ["firstSeedObject"]},
+            {"$unset": ["firstSeedObject", "errors"]},
             {
                 "$lookup": {
                     "from": "crawl_configs",
@@ -394,6 +394,8 @@ class CrawlOps:
 
         res["resources"] = await self._resolve_signed_urls(files, org, crawlid)
 
+        del res["errors"]
+
         crawl = CrawlOut.from_dict(res)
 
         return await self._resolve_crawl_refs(crawl, org)
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index 38d1d36f..ce7d67d6 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -115,6 +115,36 @@ def test_crawls_include_seed_info(admin_auth_headers, default_org_id, admin_craw
     assert crawl["seedCount"] > 0
 
 
+def test_crawls_exclude_errors(admin_auth_headers, default_org_id, admin_crawl_id):
+    # Get endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert "errors" not in data or data.get("errors") is None
+
+    # replay.json endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert "errors" not in data or data.get("errors") is None
+
+    # List endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    crawls = r.json()["items"]
+    for crawl in crawls:
+        assert "errors" not in crawl or crawl.get("errors") is None
+
+
 def test_download_wacz():
     r = requests.get(HOST_PREFIX + wacz_path)
     assert r.status_code == 200
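
The patch drops the potentially very large "errors" field in two places: the crawl list endpoint excludes it inside the MongoDB aggregation via a "$unset" stage, and the single-crawl / replay.json path deletes the key from the response dict before serialization. The sketch below is not part of the patch; it illustrates both ideas with a plain dict standing in for a crawl record. The LIST_PIPELINE_FRAGMENT constant and strip_errors helper are hypothetical names used only for illustration and do not exist in btrixcloud.

# Illustrative sketch only -- not code from backend/btrixcloud/crawls.py.
from typing import Any, Dict, List

# (1) List endpoint idea: a "$unset" aggregation stage removes fields from
# every document MongoDB returns, so large error logs never leave the database.
LIST_PIPELINE_FRAGMENT: List[Dict[str, Any]] = [
    {"$set": {"firstSeed": "$firstSeedObject.url"}},
    {"$unset": ["firstSeedObject", "errors"]},
]

# (2) Single-crawl / replay.json idea: drop the key from the already-fetched
# document. The patch itself uses `del res["errors"]`; pop() with a default
# is used here only so the sketch also works when the field is absent.
def strip_errors(res: Dict[str, Any]) -> Dict[str, Any]:
    res.pop("errors", None)
    return res

if __name__ == "__main__":
    crawl = {"id": "crawl-1", "state": "complete", "errors": ["err"] * 10000}
    assert "errors" not in strip_errors(crawl)

The two approaches differ in where the cost is paid: the "$unset" stage keeps the large field from ever being transferred out of the database for list queries, while the del/pop approach only trims a single already-fetched document before it is returned to the client, which is what the new tests in test_run_crawl.py verify for the GET, replay.json, and list endpoints.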