diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 49056dee..b6526f66 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -199,6 +199,8 @@ class CrawlConfigOut(CrawlConfig):
 
     firstSeed: Optional[str]
 
+    totalSize: Optional[int] = 0
+
     crawlCount: Optional[int] = 0
     lastCrawlId: Optional[str]
     lastCrawlStartTime: Optional[datetime]
@@ -563,8 +565,19 @@ class CrawlConfigOps:
                             }
                         }
                     }
                 },
-                # total size
-                # {"$set": {"totalSize": {"$sum": "$$finishedCrawls.$$files.size"}}},
+                {
+                    "$set": {
+                        "totalSize": {
+                            "$sum": {
+                                "$map": {
+                                    "input": "$sortedCrawls.files",
+                                    "as": "crawlFile",
+                                    "in": {"$arrayElemAt": ["$$crawlFile.size", 0]},
+                                }
+                            }
+                        }
+                    }
+                },
                 # unset
                 {"$unset": ["lastCrawl"]},
                 {"$unset": ["sortedCrawls"]},
@@ -680,6 +693,7 @@ class CrawlConfigOps:
             cid=crawlconfig.id
         )
         crawlconfig.crawlCount = crawl_stats["crawl_count"]
+        crawlconfig.totalSize = crawl_stats["total_size"]
         crawlconfig.lastCrawlId = crawl_stats["last_crawl_id"]
         crawlconfig.lastCrawlStartTime = crawl_stats["last_crawl_started"]
         crawlconfig.lastCrawlTime = crawl_stats["last_crawl_finished"]
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index 342cc4da..79bca7d0 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -384,6 +384,7 @@ class CrawlOps:
         """Get crawl statistics for a crawl_config with id cid."""
         stats = {
             "crawl_count": 0,
+            "total_size": 0,
             "last_crawl_id": None,
             "last_crawl_started": None,
             "last_crawl_finished": None,
@@ -407,6 +408,13 @@ class CrawlOps:
         if user:
             stats["last_started_by"] = user.name
 
+        total_size = 0
+        for res in results:
+            files = res["files"]
+            for file in files:
+                total_size += file["size"]
+        stats["total_size"] = total_size
+
         return stats
 
     async def _resolve_crawl_refs(
diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py
index 2945e432..fe86c891 100644
--- a/backend/test/test_crawlconfigs.py
+++ b/backend/test/test_crawlconfigs.py
@@ -205,3 +205,32 @@ def test_verify_revs_history(crawler_auth_headers, default_org_id):
     assert len(items) == 2
     sorted_data = sorted(items, key=lambda revision: revision["rev"])
     assert sorted_data[0]["config"]["scopeType"] == "prefix"
+
+
+def test_workflow_total_size(crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id):
+    admin_crawl_cid = ""
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] > 0
+    items = data["items"]
+    for workflow in items:
+        last_crawl_id = workflow.get("lastCrawlId")
+        if last_crawl_id and last_crawl_id in (admin_crawl_id, crawler_crawl_id):
+            assert workflow["totalSize"] > 0
+            if last_crawl_id == admin_crawl_id:
+                admin_crawl_cid = workflow["id"]
+        else:
+            assert workflow["totalSize"] == 0
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{admin_crawl_cid}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["totalSize"] > 0
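
Note on the new aggregation stage: "$sortedCrawls.files" resolves to one files array per crawl, so the $map visits each crawl's file list, "$arrayElemAt": ["$$crawlFile.size", 0] picks the first file's size, and $sum adds those values across crawls. A minimal sketch of the equivalent computation in plain Python, assuming each crawl document carries a files list of {"size": ...} entries (the helper name total_size_first_file is hypothetical, not part of the codebase):

def total_size_first_file(sorted_crawls):
    # Mirrors the $sum / $map / $arrayElemAt stage: sum the first file's size per crawl.
    total = 0
    for crawl in sorted_crawls:
        sizes = [f.get("size", 0) for f in crawl.get("files", [])]
        if sizes:
            total += sizes[0]  # $arrayElemAt: ["$$crawlFile.size", 0]
    return total

The Python fallback added to get_crawl_stats, by contrast, sums every file's size per crawl; with the usual single WACZ per crawl the two paths agree.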