diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index 175082e0..21a877dc 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -117,6 +117,9 @@ class Crawl(CrawlConfigCore):
 
     collections: Optional[List[UUID4]] = []
 
+    fileSize: int = 0
+    fileCount: int = 0
+
 
 # ============================================================================
 class CrawlOut(Crawl):
@@ -276,8 +279,6 @@ class CrawlOps:
         # pylint: disable=duplicate-code
         aggregate = [
             {"$match": query},
-            {"$set": {"fileSize": {"$sum": "$files.size"}}},
-            {"$set": {"fileCount": {"$size": "$files"}}},
             {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
             {"$set": {"firstSeed": "$firstSeedObject.url"}},
             {"$unset": ["firstSeedObject", "errors"]},
@@ -944,16 +945,35 @@ async def add_crawl_errors(crawls, crawl_id, errors):
 
 
 # ============================================================================
-async def add_crawl_file(crawls, crawl_id, crawl_file):
+async def add_crawl_file(crawls, crawl_id, crawl_file, size):
     """add new crawl file to crawl"""
     await crawls.find_one_and_update(
        {"_id": crawl_id},
        {
            "$push": {"files": crawl_file.dict()},
+           "$inc": {"fileCount": 1, "fileSize": size},
        },
    )
 
 
+# ============================================================================
+async def recompute_crawl_file_count_and_size(crawls, crawl_id):
+    """Fully recompute file count and size for given crawl"""
+    file_count = 0
+    size = 0
+
+    crawl_raw = await crawls.find_one({"_id": crawl_id})
+    crawl = Crawl.from_dict(crawl_raw)
+    for file_ in crawl.files:
+        file_count += 1
+        size += file_.size
+
+    await crawls.find_one_and_update(
+        {"_id": crawl_id},
+        {"$set": {"fileCount": file_count, "fileSize": size}},
+    )
+
+
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, too-many-statements
 def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user_dep):
diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py
index f740096f..341c840a 100644
--- a/backend/btrixcloud/db.py
+++ b/backend/btrixcloud/db.py
@@ -15,7 +15,7 @@ from pymongo.errors import InvalidName
 from .migrations import BaseMigration
 
 
-CURR_DB_VERSION = "0007"
+CURR_DB_VERSION = "0008"
 
 
 # ============================================================================
diff --git a/backend/btrixcloud/migrations/migration_0008_precompute_crawl_file_stats.py b/backend/btrixcloud/migrations/migration_0008_precompute_crawl_file_stats.py
new file mode 100644
index 00000000..1ee6392c
--- /dev/null
+++ b/backend/btrixcloud/migrations/migration_0008_precompute_crawl_file_stats.py
@@ -0,0 +1,36 @@
+"""
+Migration 0008 - Precomputing crawl file stats
+"""
+from btrixcloud.crawls import recompute_crawl_file_count_and_size
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0008"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    def __init__(self, mdb, migration_version=MIGRATION_VERSION):
+        super().__init__(mdb, migration_version)
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Add crawl file count and size to the database; these stats were
+        previously generated dynamically in the API endpoints.
+ """ + # pylint: disable=duplicate-code + crawls = self.mdb["crawls"] + + crawls_to_update = [res async for res in crawls.find({})] + if not crawls_to_update: + return + + for crawl in crawls_to_update: + crawl_id = crawl["_id"] + try: + await recompute_crawl_file_count_and_size(crawls, crawl_id) + # pylint: disable=broad-exception-caught + except Exception as err: + print(f"Unable to update crawl {crawl_id}: {err}", flush=True) diff --git a/backend/btrixcloud/operator.py b/backend/btrixcloud/operator.py index 6f002baa..26eb1a29 100644 --- a/backend/btrixcloud/operator.py +++ b/backend/btrixcloud/operator.py @@ -520,7 +520,7 @@ class BtrixOperator(K8sAPI): await redis.incr("filesAddedSize", filecomplete.size) - await add_crawl_file(self.crawls, crawl.id, crawl_file) + await add_crawl_file(self.crawls, crawl.id, crawl_file, filecomplete.size) return True diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index ce7d67d6..071f39ff 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -80,6 +80,7 @@ def test_crawl_info(admin_auth_headers, default_org_id, admin_crawl_id): ) data = r.json() assert data["fileSize"] == wacz_size + assert data["fileCount"] == 1 assert data["description"] == "Admin Test Crawl description"