optimization: convert all uses of 'async for' to use iterator directly (#1229)

- optimization: convert all uses of 'async for' to use the iterator directly instead of converting results to a list, to avoid building unbounded-size lists (see the sketch below)
- additional cursor.to_list() to 'async for' conversions for stats computation; simplify crawlconfigs stats computation
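
A minimal sketch of the pattern behind these changes, not the actual Browsertrix code: it assumes the Motor async MongoDB driver, and the database, collection, org id, and process() callback below are illustrative placeholders.

```python
# Sketch: replace cursor.to_list() + for-loop with 'async for' over the cursor.
import asyncio

from motor.motor_asyncio import AsyncIOMotorClient


async def main():
    # Placeholder client/collection, for illustration only.
    crawls = AsyncIOMotorClient()["example_db"]["crawls"]

    def process(doc):
        print(doc.get("_id"))

    # Before: materializes up to 10,000 documents in memory at once
    # and silently drops any results beyond the length cap.
    cursor = crawls.find({"oid": "example-org"})
    for crawl in await cursor.to_list(length=10_000):
        process(crawl)

    # After: iterate the cursor directly; documents arrive in
    # driver-sized batches, so memory use stays bounded and no
    # results are truncated.
    async for crawl in crawls.find({"oid": "example-org"}):
        process(crawl)


if __name__ == "__main__":
    asyncio.run(main())
```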

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer, 2023-09-28 12:31:08 -07:00 (committed by GitHub)
parent cabf4ccc21
commit 7eac0fdf95
14 changed files with 64 additions and 107 deletions


@@ -322,9 +322,7 @@ class CollectionOps:
total_size = 0
tags = []
cursor = self.crawls.find({"collectionIds": collection_id})
crawls = await cursor.to_list(length=10_000)
for crawl in crawls:
async for crawl in self.crawls.find({"collectionIds": collection_id}):
if crawl["state"] not in SUCCESSFUL_STATES:
continue
crawl_count += 1


@@ -605,7 +605,7 @@ class CrawlConfigOps:
total = await self.config_revs.count_documents(match_query)
cursor = self.config_revs.find({"cid": cid}, skip=skip, limit=page_size)
results = await cursor.to_list(length=1000)
results = await cursor.to_list(length=page_size)
revisions = [ConfigRevision.from_dict(res) for res in results]
return revisions, total
@@ -704,8 +704,7 @@ class CrawlConfigOps:
descriptions = [description for description in descriptions if description]
first_seeds = set()
configs = [config async for config in self.crawl_configs.find({"oid": org.id})]
for config in configs:
async for config in self.crawl_configs.find({"oid": org.id}):
first_seed = config["config"]["seeds"][0]["url"]
first_seeds.add(first_seed)
@@ -802,38 +801,46 @@ async def stats_recompute_all(crawl_configs, crawls, cid: uuid.UUID):
}
match_query = {"cid": cid, "finished": {"$ne": None}}
cursor = crawls.find(match_query).sort("finished", pymongo.DESCENDING)
results = await cursor.to_list(length=10_000)
if results:
update_query["crawlCount"] = len(results)
update_query["crawlSuccessfulCount"] = len(
[res for res in results if res["state"] not in FAILED_STATES]
)
last_crawl = results[0]
last_crawl_finished = last_crawl.get("finished")
update_query["lastCrawlId"] = str(last_crawl.get("_id"))
update_query["lastCrawlStartTime"] = last_crawl.get("started")
update_query["lastStartedBy"] = last_crawl.get("userid")
update_query["lastStartedByName"] = last_crawl.get("userName")
update_query["lastCrawlTime"] = last_crawl_finished
update_query["lastCrawlState"] = last_crawl.get("state")
update_query["lastCrawlSize"] = sum(
file_.get("size", 0) for file_ in last_crawl.get("files", [])
)
if last_crawl_finished:
update_query["lastRun"] = last_crawl_finished
count = await crawls.count_documents(match_query)
if count:
update_query["crawlCount"] = count
total_size = 0
for res in results:
successful_count = 0
last_crawl: Optional[dict[str, object]] = None
last_crawl_size = 0
async for res in crawls.find(match_query).sort("finished", pymongo.DESCENDING):
files = res.get("files", [])
crawl_size = 0
for file in files:
total_size += file.get("size", 0)
update_query["totalSize"] = total_size
crawl_size += file.get("size", 0)
total_size += crawl_size
if res["state"] not in FAILED_STATES:
successful_count += 1
last_crawl = res
last_crawl_size = crawl_size
if last_crawl:
update_query["totalSize"] = total_size
update_query["crawlSuccessfulCount"] = successful_count
update_query["lastCrawlId"] = str(last_crawl.get("_id"))
update_query["lastCrawlStartTime"] = last_crawl.get("started")
update_query["lastStartedBy"] = last_crawl.get("userid")
update_query["lastStartedByName"] = last_crawl.get("userName")
update_query["lastCrawlState"] = last_crawl.get("state")
update_query["lastCrawlSize"] = last_crawl_size
last_crawl_finished = last_crawl.get("finished")
update_query["lastCrawlTime"] = last_crawl_finished
if last_crawl_finished:
update_query["lastRun"] = last_crawl_finished
result = await crawl_configs.find_one_and_update(
{"_id": cid, "inactive": {"$ne": True}},


@@ -26,8 +26,7 @@ class Migration(BaseMigration):
crawl_configs = self.mdb["crawl_configs"]
# Return early if there are no configs
crawl_config_results = [res async for res in crawl_configs.find({})]
if not crawl_config_results:
if not await crawl_configs.count_documents({}):
return
utc_now_datetime = datetime.utcnow().replace(microsecond=0, tzinfo=None)
@@ -45,14 +44,9 @@ class Migration(BaseMigration):
await crawl_configs.update_many({}, [{"$set": {"modified": "$created"}}])
await crawl_configs.update_many({}, {"$set": {"rev": 0}})
# Return early if there are no crawls
crawl_results = [res async for res in crawls.find({})]
if not crawl_results:
return
await crawls.update_many({}, {"$set": {"cid_rev": 0}})
for crawl_result in crawl_results:
async for crawl_result in crawls.find({}):
config_result = await crawl_configs.find_one({"_id": crawl_result["cid"]})
if not config_result:
continue


@@ -25,11 +25,8 @@ class Migration(BaseMigration):
# Migrate workflows
crawl_configs = self.mdb["crawl_configs"]
crawl_config_results = [res async for res in crawl_configs.find({})]
if not crawl_config_results:
return
for config_dict in crawl_config_results:
async for config_dict in crawl_configs.find({}):
seeds_to_migrate = []
seed_dicts = []
@@ -65,9 +62,8 @@ class Migration(BaseMigration):
# Migrate seeds copied into crawls
crawls = self.mdb["crawls"]
crawl_results = [res async for res in crawls.find({})]
for crawl_dict in crawl_results:
async for crawl_dict in crawls.find({}):
seeds_to_migrate = []
seed_dicts = []
@@ -102,15 +98,13 @@ class Migration(BaseMigration):
)
# Test migration
crawl_config_results = [res async for res in crawl_configs.find({})]
for config_dict in crawl_config_results:
async for config_dict in crawl_configs.find({}):
config = CrawlConfig.from_dict(config_dict)
for seed in config.config.seeds:
assert isinstance(seed, Seed)
assert seed.url
crawl_results = [res async for res in crawls.find({})]
for crawl_dict in crawl_results:
async for crawl_dict in crawls.find({}):
crawl = Crawl.from_dict(crawl_dict)
for seed in crawl.config.seeds:
assert isinstance(seed, Seed)


@@ -37,8 +37,7 @@ class Migration(BaseMigration):
{"schedule": {"$nin": ["", None]}},
]
}
configs_to_update = [res async for res in crawl_configs.find(match_query)]
for config_dict in configs_to_update:
async for config_dict in crawl_configs.find(match_query):
config = CrawlConfig.from_dict(config_dict)
print(
f"Updating Crawl Config {config.id}: schedule: {config.schedule}, "


@@ -24,11 +24,7 @@ class Migration(BaseMigration):
crawl_configs = self.mdb["crawl_configs"]
crawls = self.mdb["crawls"]
configs = [res async for res in crawl_configs.find({"inactive": {"$ne": True}})]
if not configs:
return
for config in configs:
async for config in crawl_configs.find({"inactive": {"$ne": True}}):
config_id = config["_id"]
try:
await stats_recompute_all(crawl_configs, crawls, config_id)


@@ -24,11 +24,7 @@ class Migration(BaseMigration):
crawls = self.mdb["crawls"]
# Update workflows crawl stats to populate crawlSuccessfulCount
configs = [res async for res in crawl_configs.find({"inactive": {"$ne": True}})]
if not configs:
return
for config in configs:
async for config in crawl_configs.find({"inactive": {"$ne": True}}):
config_id = config["_id"]
try:
await stats_recompute_all(crawl_configs, crawls, config_id)


@@ -23,11 +23,7 @@ class Migration(BaseMigration):
# pylint: disable=duplicate-code
crawls = self.mdb["crawls"]
crawls_to_update = [res async for res in crawls.find({})]
if not crawls_to_update:
return
for crawl in crawls_to_update:
async for crawl in crawls.find({}):
crawl_id = crawl["_id"]
try:
await recompute_crawl_file_count_and_size(crawls, crawl_id)


@@ -22,11 +22,7 @@ class Migration(BaseMigration):
# pylint: disable=duplicate-code
coll_ops = CollectionOps(self.mdb, None, None, None)
colls_to_update = [res async for res in coll_ops.collections.find({})]
if not colls_to_update:
return
for coll in colls_to_update:
async for coll in coll_ops.collections.find({}):
coll_id = coll["_id"]
try:
await coll_ops.update_collection_counts_and_tags(coll_id)


@@ -22,11 +22,7 @@ class Migration(BaseMigration):
crawls = self.mdb["crawls"]
crawl_configs = self.mdb["crawl_configs"]
configs = [res async for res in crawl_configs.find({"inactive": {"$ne": True}})]
if not configs:
return
for config in configs:
async for config in crawl_configs.find({"inactive": {"$ne": True}}):
config_id = config["_id"]
try:
if not config.get("name"):


@@ -23,25 +23,22 @@ class Migration(BaseMigration):
mdb_crawls = self.mdb["crawls"]
mdb_profiles = self.mdb["profiles"]
orgs = [res async for res in mdb_orgs.find({})]
for org in orgs:
async for org in mdb_orgs.find({}):
oid = org.get("_id")
bytes_stored = 0
crawls = [res async for res in mdb_crawls.find({"oid": oid})]
for crawl in crawls:
async for crawl in mdb_crawls.find({"oid": oid}):
for crawl_file in crawl.get("files", []):
bytes_stored += crawl_file.get("size", 0)
profiles = [res async for res in mdb_profiles.find({"oid": oid})]
for profile in profiles:
async for profile in mdb_profiles.find({"oid": oid}):
profile_file = profile.get("resource")
if profile_file:
bytes_stored += profile_file.get("size", 0)
try:
res = await mdb_orgs.find_one_and_update(
await mdb_orgs.find_one_and_update(
{"_id": oid}, {"$set": {"bytesStored": bytes_stored}}
)
# pylint: disable=broad-exception-caught


@@ -28,8 +28,7 @@ class Migration(BaseMigration):
# Update configmap for crawl configs that have non-zero timeout or scale > 1
match_query = {"schedule": {"$nin": ["", None]}}
configs_to_update = [res async for res in crawl_configs.find(match_query)]
for config_dict in configs_to_update:
async for config_dict in crawl_configs.find(match_query):
config = CrawlConfig.from_dict(config_dict)
print(
f"Updating CronJob for Crawl Config {config.id}: schedule: {config.schedule}"


@@ -23,33 +23,24 @@ class Migration(BaseMigration):
mdb_crawls = self.mdb["crawls"]
mdb_profiles = self.mdb["profiles"]
orgs = [res async for res in mdb_orgs.find({})]
for org in orgs:
async for org in mdb_orgs.find({}):
oid = org.get("_id")
bytes_stored_crawls = 0
bytes_stored_uploads = 0
bytes_stored_profiles = 0
crawls = [
res
async for res in mdb_crawls.find(
{"oid": oid, "type": {"$in": [None, "crawl"]}}
)
]
for crawl in crawls:
async for crawl in mdb_crawls.find(
{"oid": oid, "type": {"$in": [None, "crawl"]}}
):
for crawl_file in crawl.get("files", []):
bytes_stored_crawls += crawl_file.get("size", 0)
uploads = [
res async for res in mdb_crawls.find({"oid": oid, "type": "upload"})
]
for upload in uploads:
async for upload in mdb_crawls.find({"oid": oid, "type": "upload"}):
for upload_file in upload.get("files", []):
bytes_stored_uploads += upload_file.get("size", 0)
profiles = [res async for res in mdb_profiles.find({"oid": oid})]
for profile in profiles:
async for profile in mdb_profiles.find({"oid": oid}):
profile_file = profile.get("resource")
if profile_file:
bytes_stored_profiles += profile_file.get("size", 0)
@@ -59,7 +50,7 @@ class Migration(BaseMigration):
)
try:
res = await mdb_orgs.find_one_and_update(
await mdb_orgs.find_one_and_update(
{"_id": oid},
{
"$set": {


@@ -349,9 +349,7 @@ class OrgOps:
upload_count = 0
page_count = 0
cursor = self.crawls_db.find({"oid": org.id})
items = await cursor.to_list(length=10_000)
for item in items:
async for item in self.crawls_db.find({"oid": org.id}):
if item["state"] not in SUCCESSFUL_STATES:
continue
archived_item_count += 1