Add totalSize to workflow API endpoints (#783)

Tessa Walsh, 2023-04-20 17:23:59 -04:00 (committed by GitHub)
parent 3f41498c5c
commit a2435a013b
3 changed files with 53 additions and 2 deletions


@@ -199,6 +199,8 @@ class CrawlConfigOut(CrawlConfig):
    firstSeed: Optional[str]
    totalSize: Optional[int] = 0
    crawlCount: Optional[int] = 0
    lastCrawlId: Optional[str]
    lastCrawlStartTime: Optional[datetime]
@@ -563,8 +565,19 @@ class CrawlConfigOps:
                    }
                }
            },
            # total size: sum of the first file's size from each finished crawl
            {
                "$set": {
                    "totalSize": {
                        "$sum": {
                            "$map": {
                                "input": "$sortedCrawls.files",
                                "as": "crawlFile",
                                "in": {"$arrayElemAt": ["$$crawlFile.size", 0]},
                            }
                        }
                    }
                }
            },
            # drop temporary fields
            {"$unset": ["lastCrawl"]},
            {"$unset": ["sortedCrawls"]},
@@ -680,6 +693,7 @@ class CrawlConfigOps:
            cid=crawlconfig.id
        )
        crawlconfig.crawlCount = crawl_stats["crawl_count"]
        crawlconfig.totalSize = crawl_stats["total_size"]
        crawlconfig.lastCrawlId = crawl_stats["last_crawl_id"]
        crawlconfig.lastCrawlStartTime = crawl_stats["last_crawl_started"]
        crawlconfig.lastCrawlTime = crawl_stats["last_crawl_finished"]
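
In the $set stage above, "$sortedCrawls.files" resolves to one files array per finished crawl, the $map/$arrayElemAt pair picks out the size of each crawl's first file, and $sum adds those sizes together. A pure-Python sketch of the same computation (the sample data is hypothetical, assuming each crawl carries a "files" list of {"size": ...} dicts):

def pipeline_total_size(sorted_crawls):
    # "$sortedCrawls.files" -> one files list per crawl
    files_per_crawl = [crawl.get("files", []) for crawl in sorted_crawls]
    # {"$arrayElemAt": ["$$crawlFile.size", 0]} -> the first file's size;
    # crawls with no files contribute nothing, as $sum skips missing values
    first_sizes = [files[0]["size"] for files in files_per_crawl if files]
    # {"$sum": ...} -> total across all finished crawls
    return sum(first_sizes)

assert pipeline_total_size(
    [{"files": [{"size": 100}]}, {"files": [{"size": 250}]}, {"files": []}]
) == 350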


@@ -384,6 +384,7 @@ class CrawlOps:
        """Get crawl statistics for a crawl_config with id cid."""
        stats = {
            "crawl_count": 0,
            "total_size": 0,
            "last_crawl_id": None,
            "last_crawl_started": None,
            "last_crawl_finished": None,
@@ -407,6 +408,13 @@ class CrawlOps:
        if user:
            stats["last_started_by"] = user.name

        # add up the size of every file across this workflow's crawls
        total_size = 0
        for res in results:
            for file in res.get("files", []):
                total_size += file["size"]
        stats["total_size"] = total_size

        return stats

    async def _resolve_crawl_refs(
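
Note that this loop sums every file of every matching crawl, while the aggregation stage in CrawlConfigOps above only counts each crawl's first file; the two agree whenever a crawl produces a single file. A condensed sketch of the loop, with hypothetical rows:

results = [
    {"files": [{"size": 1000}, {"size": 500}]},
    {"files": [{"size": 250}]},
]
total_size = sum(f["size"] for res in results for f in res.get("files", []))
assert total_size == 1750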


@@ -205,3 +205,32 @@ def test_verify_revs_history(crawler_auth_headers, default_org_id):
    assert len(items) == 2
    sorted_data = sorted(items, key=lambda revision: revision["rev"])
    assert sorted_data[0]["config"]["scopeType"] == "prefix"


def test_workflow_total_size(
    crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
    admin_crawl_cid = ""

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] > 0

    # workflows whose last crawl is one of the fixture crawls report a
    # positive size; all others should report totalSize == 0
    items = data["items"]
    for workflow in items:
        last_crawl_id = workflow.get("lastCrawlId")
        if last_crawl_id and last_crawl_id in (admin_crawl_id, crawler_crawl_id):
            assert workflow["totalSize"] > 0
            if last_crawl_id == admin_crawl_id:
                admin_crawl_cid = workflow["id"]
        else:
            assert workflow["totalSize"] == 0

    # the workflow detail endpoint should report the same field
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{admin_crawl_cid}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["totalSize"] > 0
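
For reference, a client-side sketch of reading the new field the way the test does (the URL, org id, and token below are placeholders, not values from this commit):

import requests

API_PREFIX = "https://btrix.example.com/api"   # placeholder deployment URL
org_id = "<default-org-id>"                    # placeholder org id
headers = {"Authorization": "Bearer <token>"}  # placeholder auth token

r = requests.get(f"{API_PREFIX}/orgs/{org_id}/crawlconfigs", headers=headers)
r.raise_for_status()
for workflow in r.json()["items"]:
    # totalSize defaults to 0 until the workflow has a finished crawl
    print(workflow["id"], workflow.get("totalSize", 0))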