crawls list: unset errors in crawls list response to avoid very large responses (#904)

* crawls list: unset errors in crawls list response to avoid very large responses #872
* Remove errors from crawl replay.json
* Add tests to ensure errors are excluded from crawl GET endpoints
* Update tests to accept None for errors
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-06-02 18:52:59 -07:00 · 2023-06-02 18:52:59 -07:00 · 3f42515914
commit 3f42515914
parent 0284903b34
2 changed files with 33 additions and 1 deletions
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@ -280,7 +280,7 @@ class CrawlOps:
            {"$set": {"fileCount": {"$size": "$files"}}},
            {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
            {"$set": {"firstSeed": "$firstSeedObject.url"}},
-            {"$unset": ["firstSeedObject"]},
+            {"$unset": ["firstSeedObject", "errors"]},
            {
                "$lookup": {
                    "from": "crawl_configs",
@ -394,6 +394,8 @@ class CrawlOps:
            res["resources"] = await self._resolve_signed_urls(files, org, crawlid)
        del res["errors"]
        crawl = CrawlOut.from_dict(res)
        return await self._resolve_crawl_refs(crawl, org)
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@ -115,6 +115,36 @@ def test_crawls_include_seed_info(admin_auth_headers, default_org_id, admin_craw
        assert crawl["seedCount"] > 0
 def test_crawls_exclude_errors(admin_auth_headers, default_org_id, admin_crawl_id):
    """Check that crawl responses do not carry an ``errors`` payload.

    Three endpoints are queried in turn: the single-crawl GET, the crawl's
    replay.json, and the org-wide crawl list. In every returned crawl
    object the ``errors`` key must be either missing or ``None``.
    """
    # Single-crawl endpoints, queried in order: GET first, then replay.json
    single_crawl_urls = (
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
    )
    for url in single_crawl_urls:
        resp = requests.get(url, headers=admin_auth_headers)
        assert resp.status_code == 200
        payload = resp.json()
        assert not ("errors" in payload and payload.get("errors") is not None)

    # List endpoint: every item in the page must satisfy the same condition
    list_resp = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls",
        headers=admin_auth_headers,
    )
    assert list_resp.status_code == 200
    for item in list_resp.json()["items"]:
        assert not ("errors" in item and item.get("errors") is not None)
 def test_download_wacz():
    r = requests.get(HOST_PREFIX + wacz_path)
    assert r.status_code == 200