crawls list: unset errors in crawls list response to avoid very large responses (#904)

* crawls list: unset errors in crawls list response to avoid very large responses #872

* Remove errors from crawl replay.json

* Add tests to ensure errors are excluded from crawl GET endpoints

* Update tests to accept None for errors
---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer 2023-06-02 18:52:59 -07:00 committed by GitHub
parent 0284903b34
commit 3f42515914
2 changed files with 33 additions and 1 deletion


@@ -280,7 +280,7 @@ class CrawlOps:
             {"$set": {"fileCount": {"$size": "$files"}}},
             {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
             {"$set": {"firstSeed": "$firstSeedObject.url"}},
-            {"$unset": ["firstSeedObject"]},
+            {"$unset": ["firstSeedObject", "errors"]},
             {
                 "$lookup": {
                     "from": "crawl_configs",
@@ -394,6 +394,8 @@ class CrawlOps:
             res["resources"] = await self._resolve_signed_urls(files, org, crawlid)
 
+        del res["errors"]
+
         crawl = CrawlOut.from_dict(res)
         return await self._resolve_crawl_refs(crawl, org)
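
For context, MongoDB's $unset aggregation stage drops the named fields from every document flowing through the pipeline, so the potentially large per-page errors arrays never leave the database on list queries. Below is a minimal sketch of that behavior, assuming a pymongo client; the connection string and collection names are illustrative, with the pipeline stages matching the diff above.

    # Sketch only: connection details and database name are hypothetical.
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")
    crawls = client["btrix"]["crawls"]

    pipeline = [
        {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
        {"$set": {"firstSeed": "$firstSeedObject.url"}},
        # Dropping "errors" here keeps large error lists out of every
        # document returned by the list endpoint.
        {"$unset": ["firstSeedObject", "errors"]},
    ]
    for doc in crawls.aggregate(pipeline):
        assert "errors" not in doc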


@@ -115,6 +115,36 @@ def test_crawls_include_seed_info(admin_auth_headers, default_org_id, admin_craw
         assert crawl["seedCount"] > 0
 
 
+def test_crawls_exclude_errors(admin_auth_headers, default_org_id, admin_crawl_id):
+    # Get endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert "errors" not in data or data.get("errors") is None
+
+    # replay.json endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert "errors" not in data or data.get("errors") is None
+
+    # List endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    crawls = r.json()["items"]
+    for crawl in crawls:
+        assert "errors" not in crawl or crawl.get("errors") is None
+
+
 def test_download_wacz():
     r = requests.get(HOST_PREFIX + wacz_path)
     assert r.status_code == 200
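
The tests accept either a missing key or an explicit null ("Update tests to accept None for errors") because a response model that still declares the field will serialize it as None even after del res["errors"] removes it from the raw document. A hypothetical Pydantic sketch of that serialization behavior; the field layout here is assumed, not the actual CrawlOut model:

    from typing import List, Optional
    from pydantic import BaseModel

    # Hypothetical stand-in for the CrawlOut response model.
    class CrawlOut(BaseModel):
        id: str
        errors: Optional[List[str]] = None  # absent in input -> null in output

    print(CrawlOut(id="crawl-1").dict())
    # {'id': 'crawl-1', 'errors': None}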