Pre-compute workflow last crawl information (#812)

* Precompute config crawl stats

* Includes a database migration that moves previously dynamically computed workflow crawl stats into the CrawlConfig model (a sketch of the approach follows the changed-files summary below).

* Add crawls.finished descending index

* Add last crawl fields to workflow tests
Author: Tessa Walsh, 2023-05-05 18:12:52 -04:00 (committed by GitHub)
parent 9fcbc3f87e
commit 8281ba723e
7 changed files with 170 additions and 166 deletions
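
In outline, the change looks like the following minimal sketch (not the project's actual wiring: it talks to MongoDB directly via Motor, and the function name, connection URL, and database name are illustrative). Each workflow document in crawl_configs now carries its own crawl statistics, which were previously assembled at read time with a $lookup into the crawls collection, and a new descending index on crawls.finished keeps recomputing them cheap.

import pymongo
from motor.motor_asyncio import AsyncIOMotorClient


async def show_precomputed_stats(cid):
    # Illustrative connection; the real app builds its client in init_db().
    mdb = AsyncIOMotorClient("mongodb://localhost:27017")["browsertrix"]

    # The workflow document itself now stores the last-crawl fields that were
    # previously computed on the fly with an aggregation over "crawls".
    config = await mdb["crawl_configs"].find_one({"_id": cid})
    if config:
        for field in (
            "crawlCount",
            "totalSize",
            "lastCrawlId",
            "lastCrawlStartTime",
            "lastStartedBy",
            "lastCrawlTime",
            "lastCrawlState",
            "lastCrawlSize",
        ):
            print(field, config.get(field))

    # The new index backs the "latest finished crawl per workflow" query used
    # when the stats are recomputed.
    await mdb["crawls"].create_index([("finished", pymongo.DESCENDING)])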

View File

@@ -178,6 +178,16 @@ class CrawlConfig(CrawlConfigCore):
rev: int = 0
crawlCount: Optional[int] = 0
totalSize: Optional[int] = 0
lastCrawlId: Optional[str]
lastCrawlStartTime: Optional[datetime]
lastStartedBy: Optional[UUID4]
lastCrawlTime: Optional[datetime]
lastCrawlState: Optional[str]
lastCrawlSize: Optional[int]
def get_raw_config(self):
"""serialize config for browsertrix-crawler"""
return self.config.dict(exclude_unset=True, exclude_none=True)
@@ -200,15 +210,6 @@ class CrawlConfigOut(CrawlConfig):
firstSeed: Optional[str]
totalSize: Optional[int] = 0
crawlCount: Optional[int] = 0
lastCrawlId: Optional[str]
lastCrawlStartTime: Optional[datetime]
lastCrawlTime: Optional[datetime]
lastCrawlState: Optional[str]
lastCrawlSize: Optional[int]
# ============================================================================
class CrawlConfigIdNameOut(BaseMongoModel):
@@ -241,6 +242,7 @@ class CrawlConfigOps:
def __init__(self, dbclient, mdb, user_manager, org_ops, crawl_manager, profiles):
self.dbclient = dbclient
self.crawls = mdb["crawls"]
self.crawl_configs = mdb["crawl_configs"]
self.config_revs = mdb["configs_revs"]
self.user_manager = user_manager
@@ -264,7 +266,7 @@ class CrawlConfigOps:
self.crawl_ops = ops
async def init_index(self):
"""init index for crawls db"""
"""init index for crawlconfigs db collection"""
await self.crawl_configs.create_index(
[("oid", pymongo.HASHED), ("inactive", pymongo.ASCENDING)]
)
@@ -509,84 +511,6 @@ class CrawlConfigOps:
# Set firstSeed
{"$set": {"firstSeed": "$firstSeedObject.url"}},
{"$unset": ["firstSeedObject"]},
{
"$lookup": {
"from": "crawls",
"localField": "_id",
"foreignField": "cid",
"as": "configCrawls",
},
},
# Filter workflow crawls on finished and active
{
"$set": {
"finishedCrawls": {
"$filter": {
"input": "$configCrawls",
"as": "filterCrawls",
"cond": {
"$and": [
{"$ne": ["$$filterCrawls.finished", None]},
{"$ne": ["$$filterCrawls.inactive", True]},
]
},
}
}
}
},
# Set crawl count to number of finished crawls
{"$set": {"crawlCount": {"$size": "$finishedCrawls"}}},
# Sort finished crawls by finished time descending to get latest
{
"$set": {
"sortedCrawls": {
"$sortArray": {
"input": "$finishedCrawls",
"sortBy": {"finished": -1},
}
}
}
},
{"$unset": ["finishedCrawls"]},
{"$set": {"lastCrawl": {"$arrayElemAt": ["$sortedCrawls", 0]}}},
{"$set": {"lastCrawlId": "$lastCrawl._id"}},
{"$set": {"lastCrawlStartTime": "$lastCrawl.started"}},
{"$set": {"lastCrawlTime": "$lastCrawl.finished"}},
{"$set": {"lastCrawlState": "$lastCrawl.state"}},
# Get userid of last started crawl
{"$set": {"lastStartedBy": "$lastCrawl.userid"}},
{"$set": {"lastCrawlSize": {"$sum": "$lastCrawl.files.size"}}},
{
"$lookup": {
"from": "users",
"localField": "lastStartedBy",
"foreignField": "id",
"as": "lastStartedByName",
},
},
{
"$set": {
"lastStartedByName": {
"$arrayElemAt": ["$lastStartedByName.name", 0]
}
}
},
{
"$set": {
"totalSize": {
"$sum": {
"$map": {
"input": "$sortedCrawls.files",
"as": "crawlFile",
"in": {"$arrayElemAt": ["$$crawlFile.size", 0]},
}
}
}
}
},
# unset
{"$unset": ["lastCrawl"]},
{"$unset": ["sortedCrawls"]},
]
if first_seed:
@@ -611,6 +535,19 @@ class CrawlConfigOps:
},
},
{"$set": {"createdByName": {"$arrayElemAt": ["$userName.name", 0]}}},
{
"$lookup": {
"from": "users",
"localField": "lastStartedBy",
"foreignField": "id",
"as": "startedName",
},
},
{
"$set": {
"lastStartedByName": {"$arrayElemAt": ["$startedName.name", 0]}
}
},
{
"$lookup": {
"from": "users",
@@ -648,22 +585,12 @@ class CrawlConfigOps:
except (IndexError, ValueError):
total = 0
# crawls = await self.crawl_manager.list_running_crawls(oid=org.id)
crawls, _ = await self.crawl_ops.list_crawls(
org=org,
running_only=True,
# Set high so that when we lower default we still get all running crawls
page_size=1_000,
)
running = {}
for crawl in crawls:
running[crawl.cid] = crawl
configs = []
for res in items:
config = CrawlConfigOut.from_dict(res)
# pylint: disable=invalid-name
self._add_curr_crawl_stats(config, running.get(config.id))
if not config.inactive:
self._add_curr_crawl_stats(config, await self.get_running_crawl(config))
configs.append(config)
return configs, total
@@ -693,20 +620,9 @@ class CrawlConfigOps:
return None
async def _annotate_with_crawl_stats(self, crawlconfig: CrawlConfigOut):
"""Annotate crawlconfig with information about associated crawls"""
crawl_stats = await self.crawl_ops.get_latest_crawl_and_count_by_config(
cid=crawlconfig.id
)
crawlconfig.crawlCount = crawl_stats["crawl_count"]
crawlconfig.totalSize = crawl_stats["total_size"]
crawlconfig.lastCrawlId = crawl_stats["last_crawl_id"]
crawlconfig.lastCrawlStartTime = crawl_stats["last_crawl_started"]
crawlconfig.lastCrawlTime = crawl_stats["last_crawl_finished"]
crawlconfig.lastStartedByName = crawl_stats["last_started_by"]
crawlconfig.lastCrawlState = crawl_stats["last_crawl_state"]
crawlconfig.lastCrawlSize = crawl_stats["last_crawl_size"]
return crawlconfig
async def update_crawl_stats(self, cid: uuid.UUID):
"""Update crawl count, total size, and last crawl information for config."""
await update_config_crawl_stats(self.crawl_configs, self.crawls, cid)
def _add_curr_crawl_stats(self, crawlconfig, crawl):
"""Add stats from current running crawl, if any"""
@@ -745,13 +661,17 @@ class CrawlConfigOps:
if modified_user:
crawlconfig.modifiedByName = modified_user.name
if crawlconfig.lastStartedBy:
last_started_user = await self.user_manager.get(crawlconfig.lastStartedBy)
# pylint: disable=invalid-name
if last_started_user:
crawlconfig.lastStartedByName = last_started_user.name
if crawlconfig.profileid:
crawlconfig.profileName = await self.profiles.get_profile_name(
crawlconfig.profileid, org
)
crawlconfig = await self._annotate_with_crawl_stats(crawlconfig)
return crawlconfig
async def get_crawl_config(
@@ -957,6 +877,58 @@ async def inc_crawl_count(crawl_configs, cid: uuid.UUID):
)
# ============================================================================
# pylint: disable=too-many-locals
async def update_config_crawl_stats(crawl_configs, crawls, cid: uuid.UUID):
"""re-calculate and update crawl statistics for config"""
update_query = {
"crawlCount": 0,
"totalSize": 0,
"lastCrawlId": None,
"lastCrawlStartTime": None,
"lastStartedBy": None,
"lastCrawlTime": None,
"lastCrawlState": None,
"lastCrawlSize": None,
}
match_query = {"cid": cid, "finished": {"$ne": None}, "inactive": {"$ne": True}}
cursor = crawls.find(match_query).sort("finished", pymongo.DESCENDING)
results = await cursor.to_list(length=10_000)
if results:
update_query["crawlCount"] = len(results)
last_crawl = results[0]
update_query["lastCrawlId"] = str(last_crawl.get("_id"))
update_query["lastCrawlStartTime"] = last_crawl.get("started")
update_query["lastStartedBy"] = last_crawl.get("userid")
update_query["lastCrawlTime"] = last_crawl.get("finished")
update_query["lastCrawlState"] = last_crawl.get("state")
update_query["lastCrawlSize"] = sum(
file_.get("size", 0) for file_ in last_crawl.get("files", [])
)
total_size = 0
for res in results:
files = res.get("files", [])
for file in files:
total_size += file.get("size", 0)
update_query["totalSize"] = total_size
result = await crawl_configs.find_one_and_update(
{"_id": cid, "inactive": {"$ne": True}},
{"$set": update_query},
return_document=pymongo.ReturnDocument.AFTER,
)
if not result:
raise HTTPException(
status_code=404, detail=f"Crawl Config '{cid}' not found to update"
)
return result
# ============================================================================
# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
def init_crawl_config_api(

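For reference, a minimal usage sketch of the update_config_crawl_stats helper added above (the connection URL, database name, and wrapper function are illustrative; in the application the collections come from init_db() and the helper is invoked when a crawl finishes, when crawls are deleted, and by migration 0006):

import asyncio
import uuid

from motor.motor_asyncio import AsyncIOMotorClient

from btrixcloud.crawlconfigs import update_config_crawl_stats


async def refresh_workflow_stats(cid: uuid.UUID):
    # Illustrative connection; the app derives its client from environment
    # variables via init_db().
    mdb = AsyncIOMotorClient("mongodb://localhost:27017")["browsertrix"]

    # Recomputes crawlCount, totalSize, and the lastCrawl* fields for one
    # workflow, persists them on its crawl_configs document, and returns the
    # updated document (raises HTTPException if the config is not found).
    updated = await update_config_crawl_stats(mdb["crawl_configs"], mdb["crawls"], cid)
    print(updated["crawlCount"], updated["lastCrawlState"])


# Example invocation (substitute a real workflow id):
# asyncio.run(refresh_workflow_stats(uuid.UUID("00000000-0000-0000-0000-000000000000")))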
View File

@@ -196,6 +196,10 @@ class CrawlOps:
self.presign_duration = int(os.environ.get("PRESIGN_DURATION_SECONDS", 3600))
async def init_index(self):
"""init index for crawls db collection"""
await self.crawls.create_index([("finished", pymongo.DESCENDING)])
async def list_crawls(
self,
org: Optional[Organization] = None,
@@ -380,45 +384,6 @@ class CrawlOps:
return await self._resolve_crawl_refs(crawl, org)
async def get_latest_crawl_and_count_by_config(self, cid: str):
"""Get crawl statistics for a crawl_config with id cid."""
stats = {
"crawl_count": 0,
"total_size": 0,
"last_crawl_id": None,
"last_crawl_started": None,
"last_crawl_finished": None,
"last_crawl_state": None,
"last_started_by": None,
"last_crawl_size": 0,
}
match_query = {"cid": cid, "finished": {"$ne": None}, "inactive": {"$ne": True}}
cursor = self.crawls.find(match_query).sort("finished", pymongo.DESCENDING)
results = await cursor.to_list(length=1000)
if results:
stats["crawl_count"] = len(results)
last_crawl = Crawl.from_dict(results[0])
stats["last_crawl_id"] = str(last_crawl.id)
stats["last_crawl_started"] = last_crawl.started
stats["last_crawl_finished"] = last_crawl.finished
stats["last_crawl_state"] = last_crawl.state
stats["last_crawl_size"] = sum(file_.size for file_ in last_crawl.files)
user = await self.user_manager.get(last_crawl.userid)
if user:
stats["last_started_by"] = user.name
total_size = 0
for res in results:
files = res["files"]
for file in files:
total_size += file["size"]
stats["total_size"] = total_size
return stats
async def _resolve_crawl_refs(
self,
crawl: Union[CrawlOut, ListCrawlOut],
@@ -527,13 +492,22 @@ class CrawlOps:
async def delete_crawls(self, org: Organization, delete_list: DeleteCrawlList):
"""Delete a list of crawls by id for given org"""
cids_to_update = set()
for crawl_id in delete_list.crawl_ids:
await self._delete_crawl_files(org, crawl_id)
await self.remove_crawl_from_collections(org, crawl_id)
crawl = await self.get_crawl_raw(crawl_id, org)
cids_to_update.add(crawl["cid"])
res = await self.crawls.delete_many(
{"_id": {"$in": delete_list.crawl_ids}, "oid": org.id}
)
for cid in cids_to_update:
await self.crawl_configs.update_crawl_stats(cid)
return res.deleted_count
async def _delete_crawl_files(self, org: Organization, crawl_id: str):
@@ -837,7 +811,7 @@ class CrawlOps:
return resp
async def remove_crawl_from_collections(self, oid: uuid.UUID, crawl_id: str):
async def remove_crawl_from_collections(self, org: Organization, crawl_id: str):
"""Remove crawl with given crawl_id from all collections it belongs to"""
collections = [
coll["name"]
@@ -845,7 +819,7 @@ class CrawlOps:
]
for collection_name in collections:
await self.collections.find_one_and_update(
{"name": collection_name, "oid": oid},
{"name": collection_name, "oid": org.id},
{"$pull": {"crawlIds": crawl_id}},
)
@@ -884,7 +858,11 @@ async def add_new_crawl(
# ============================================================================
async def update_crawl(crawls, crawl_id, **kwargs):
"""update crawl state in db"""
await crawls.find_one_and_update({"_id": crawl_id}, {"$set": kwargs})
return await crawls.find_one_and_update(
{"_id": crawl_id},
{"$set": kwargs},
return_document=pymongo.ReturnDocument.AFTER,
)
# ============================================================================
@@ -1056,8 +1034,6 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
status_code=400, detail=f"Error Stopping Crawl: {exc}"
)
await ops.remove_crawl_from_collections(crawl.oid, crawl.id)
res = await ops.delete_crawls(org, delete_list)
return {"deleted": res}

View File

@@ -13,7 +13,7 @@ from pymongo.errors import InvalidName
from .migrations import BaseMigration
CURR_DB_VERSION = "0005"
CURR_DB_VERSION = "0006"
# ============================================================================
@@ -55,6 +55,7 @@ async def update_and_prepare_db(
mdb,
user_manager,
org_ops,
crawl_ops,
crawl_config_ops,
coll_ops,
invite_ops,
@@ -70,7 +71,7 @@
print("Database setup started", flush=True)
if await run_db_migrations(mdb, user_manager):
await drop_indexes(mdb)
await create_indexes(org_ops, crawl_config_ops, coll_ops, invite_ops)
await create_indexes(org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops)
await user_manager.create_super_user()
await org_ops.create_default_org()
print("Database updated and ready", flush=True)
@@ -134,10 +135,11 @@ async def drop_indexes(mdb):
# ============================================================================
async def create_indexes(org_ops, crawl_config_ops, coll_ops, invite_ops):
async def create_indexes(org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops):
"""Create database indexes."""
print("Creating database indexes", flush=True)
await org_ops.init_index()
await crawl_ops.init_index()
await crawl_config_ops.init_index()
await coll_ops.init_index()
await invite_ops.init_index()

View File

@@ -114,7 +114,7 @@ def main():
if run_once_lock("btrix-init-db"):
asyncio.create_task(
update_and_prepare_db(
mdb, user_manager, org_ops, crawl_config_ops, coll_ops, invites
mdb, user_manager, org_ops, crawls, crawl_config_ops, coll_ops, invites
)
)

View File

@@ -0,0 +1,31 @@
"""
Migration 0006 - Precomputing workflow crawl stats
"""
from btrixcloud.crawlconfigs import update_config_crawl_stats
from btrixcloud.migrations import BaseMigration
MIGRATION_VERSION = "0006"
class Migration(BaseMigration):
"""Migration class."""
def __init__(self, mdb, migration_version=MIGRATION_VERSION):
super().__init__(mdb, migration_version)
async def migrate_up(self):
"""Perform migration up.
Add workflow crawl statistics, previously computed dynamically on
demand, to the database.
"""
crawl_configs = self.mdb["crawl_configs"]
crawls = self.mdb["crawls"]
configs = [res async for res in crawl_configs.find({})]
if not configs:
return
for config in configs:
await update_config_crawl_stats(crawl_configs, crawls, config["_id"])

View File

@@ -19,6 +19,7 @@ from .k8sapi import K8sAPI
from .db import init_db
from .orgs import inc_org_stats
from .crawlconfigs import update_config_crawl_stats
from .crawls import (
CrawlFile,
CrawlCompleteIn,
@@ -97,6 +98,7 @@ class BtrixOperator(K8sAPI):
_, mdb = init_db()
self.crawls = mdb["crawls"]
self.crawl_configs = mdb["crawl_configs"]
self.orgs = mdb["organizations"]
self.done_key = "crawls-done"
@@ -446,7 +448,10 @@ class BtrixOperator(K8sAPI):
if stats:
kwargs["stats"] = stats
await update_crawl(self.crawls, crawl_id, **kwargs)
crawl = await update_crawl(self.crawls, crawl_id, **kwargs)
crawl_cid = crawl.get("cid")
await update_config_crawl_stats(self.crawl_configs, self.crawls, crawl_cid)
if redis:
await self.add_crawl_errors_to_db(redis, crawl_id)

View File

@@ -208,7 +208,7 @@ def test_verify_revs_history(crawler_auth_headers, default_org_id):
assert sorted_data[0]["config"]["scopeType"] == "prefix"
def test_workflow_total_size(
def test_workflow_total_size_and_last_crawl_stats(
crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
admin_crawl_cid = ""
@@ -225,8 +225,18 @@ def test_workflow_total_size(
last_crawl_id = workflow.get("lastCrawlId")
if last_crawl_id and last_crawl_id in (admin_crawl_id, crawler_crawl_id):
assert workflow["totalSize"] > 0
assert workflow["crawlCount"] > 0
assert workflow["lastCrawlId"]
assert workflow["lastCrawlStartTime"]
assert workflow["lastStartedByName"]
assert workflow["lastCrawlTime"]
assert workflow["lastCrawlState"]
assert workflow["lastCrawlSize"] > 0
if last_crawl_id == admin_crawl_id:
admin_crawl_cid = workflow["id"]
assert admin_crawl_cid
else:
assert workflow["totalSize"] == 0
@@ -237,3 +247,11 @@ def test_workflow_total_size(
assert r.status_code == 200
data = r.json()
assert data["totalSize"] > 0
assert data["crawlCount"] > 0
assert data["lastCrawlId"]
assert data["lastCrawlStartTime"]
assert data["lastStartedByName"]
assert data["lastCrawlTime"]
assert data["lastCrawlState"]
assert data["lastCrawlSize"] > 0