Precompute crawl file stats (#906)

Tessa Walsh 2023-06-07 19:39:49 -04:00 committed by GitHub
parent dd757961fc
commit 120f7ca158
5 changed files with 62 additions and 5 deletions


@@ -117,6 +117,9 @@ class Crawl(CrawlConfigCore):
collections: Optional[List[UUID4]] = []
fileSize: int = 0
fileCount: int = 0
# ============================================================================
class CrawlOut(Crawl):
@@ -276,8 +279,6 @@ class CrawlOps:
# pylint: disable=duplicate-code
aggregate = [
{"$match": query},
{"$set": {"fileSize": {"$sum": "$files.size"}}},
{"$set": {"fileCount": {"$size": "$files"}}},
{"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
{"$set": {"firstSeed": "$firstSeedObject.url"}},
{"$unset": ["firstSeedObject", "errors"]},
@@ -944,16 +945,35 @@ async def add_crawl_errors(crawls, crawl_id, errors):
# ============================================================================
-async def add_crawl_file(crawls, crawl_id, crawl_file):
+async def add_crawl_file(crawls, crawl_id, crawl_file, size):
"""add new crawl file to crawl"""
await crawls.find_one_and_update(
{"_id": crawl_id},
{
"$push": {"files": crawl_file.dict()},
"$inc": {"fileCount": 1, "fileSize": size},
},
)
# ============================================================================
async def recompute_crawl_file_count_and_size(crawls, crawl_id):
"""Fully recompute file count and size for given crawl"""
file_count = 0
size = 0
crawl_raw = await crawls.find_one({"_id": crawl_id})
crawl = Crawl.from_dict(crawl_raw)
for file_ in crawl.files:
file_count += 1
size += file_.size
await crawls.find_one_and_update(
{"_id": crawl_id},
{"$set": {"fileCount": file_count, "fileSize": size}},
)
# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, too-many-statements
def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user_dep):
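Note: the following is a minimal usage sketch of the new recompute helper, not part of this commit; the connection string, database name, and crawl id are placeholders.

import asyncio

from motor.motor_asyncio import AsyncIOMotorClient

from btrixcloud.crawls import recompute_crawl_file_count_and_size


async def backfill_one(crawl_id):
    # Recompute the stored fileCount/fileSize for a single crawl document
    # by delegating to the helper above, which re-reads the crawl's files array.
    mdb = AsyncIOMotorClient("mongodb://localhost:27017")["btrixcloud"]
    await recompute_crawl_file_count_and_size(mdb["crawls"], crawl_id)


asyncio.run(backfill_one("example-crawl-id"))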


@ -15,7 +15,7 @@ from pymongo.errors import InvalidName
from .migrations import BaseMigration
CURR_DB_VERSION = "0007"
CURR_DB_VERSION = "0008"
# ============================================================================


@@ -0,0 +1,36 @@
"""
Migration 0008 - Precomputing crawl file stats
"""
from btrixcloud.crawls import recompute_crawl_file_count_and_size
from btrixcloud.migrations import BaseMigration
MIGRATION_VERSION = "0008"
class Migration(BaseMigration):
"""Migration class."""
def __init__(self, mdb, migration_version=MIGRATION_VERSION):
super().__init__(mdb, migration_version)
async def migrate_up(self):
"""Perform migration up.
Add data on crawl file count and size to database that was previously
dynamically generated in the API endpoints.
"""
# pylint: disable=duplicate-code
crawls = self.mdb["crawls"]
crawls_to_update = [res async for res in crawls.find({})]
if not crawls_to_update:
return
for crawl in crawls_to_update:
crawl_id = crawl["_id"]
try:
await recompute_crawl_file_count_and_size(crawls, crawl_id)
# pylint: disable=broad-exception-caught
except Exception as err:
print(f"Unable to update crawl {crawl_id}: {err}", flush=True)


@@ -520,7 +520,7 @@ class BtrixOperator(K8sAPI):
await redis.incr("filesAddedSize", filecomplete.size)
-await add_crawl_file(self.crawls, crawl.id, crawl_file)
+await add_crawl_file(self.crawls, crawl.id, crawl_file, filecomplete.size)
return True


@@ -80,6 +80,7 @@ def test_crawl_info(admin_auth_headers, default_org_id, admin_crawl_id):
)
data = r.json()
assert data["fileSize"] == wacz_size
assert data["fileCount"] == 1
assert data["description"] == "Admin Test Crawl description"