diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py
index 25b7cf30..c35081c4 100644
--- a/backend/btrixcloud/colls.py
+++ b/backend/btrixcloud/colls.py
@@ -322,9 +322,7 @@ class CollectionOps:
         total_size = 0
         tags = []
 
-        cursor = self.crawls.find({"collectionIds": collection_id})
-        crawls = await cursor.to_list(length=10_000)
-        for crawl in crawls:
+        async for crawl in self.crawls.find({"collectionIds": collection_id}):
             if crawl["state"] not in SUCCESSFUL_STATES:
                 continue
             crawl_count += 1
diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 4724e0d4..84138296 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -605,7 +605,7 @@ class CrawlConfigOps:
         total = await self.config_revs.count_documents(match_query)
 
         cursor = self.config_revs.find({"cid": cid}, skip=skip, limit=page_size)
-        results = await cursor.to_list(length=1000)
+        results = await cursor.to_list(length=page_size)
         revisions = [ConfigRevision.from_dict(res) for res in results]
 
         return revisions, total
@@ -704,8 +704,7 @@ class CrawlConfigOps:
         descriptions = [description for description in descriptions if description]
 
         first_seeds = set()
-        configs = [config async for config in self.crawl_configs.find({"oid": org.id})]
-        for config in configs:
+        async for config in self.crawl_configs.find({"oid": org.id}):
             first_seed = config["config"]["seeds"][0]["url"]
             first_seeds.add(first_seed)
 
@@ -802,38 +801,46 @@ async def stats_recompute_all(crawl_configs, crawls, cid: uuid.UUID):
     }
 
     match_query = {"cid": cid, "finished": {"$ne": None}}
-    cursor = crawls.find(match_query).sort("finished", pymongo.DESCENDING)
-    results = await cursor.to_list(length=10_000)
-    if results:
-        update_query["crawlCount"] = len(results)
-
-        update_query["crawlSuccessfulCount"] = len(
-            [res for res in results if res["state"] not in FAILED_STATES]
-        )
-
-        last_crawl = results[0]
-
-        last_crawl_finished = last_crawl.get("finished")
-
-        update_query["lastCrawlId"] = str(last_crawl.get("_id"))
-        update_query["lastCrawlStartTime"] = last_crawl.get("started")
-        update_query["lastStartedBy"] = last_crawl.get("userid")
-        update_query["lastStartedByName"] = last_crawl.get("userName")
-        update_query["lastCrawlTime"] = last_crawl_finished
-        update_query["lastCrawlState"] = last_crawl.get("state")
-        update_query["lastCrawlSize"] = sum(
-            file_.get("size", 0) for file_ in last_crawl.get("files", [])
-        )
-
-        if last_crawl_finished:
-            update_query["lastRun"] = last_crawl_finished
+    count = await crawls.count_documents(match_query)
+    if count:
+        update_query["crawlCount"] = count
 
         total_size = 0
-        for res in results:
+        successful_count = 0
+
+        last_crawl: Optional[dict[str, object]] = None
+        last_crawl_size = 0
+
+        async for res in crawls.find(match_query).sort("finished", pymongo.ASCENDING):
             files = res.get("files", [])
+            crawl_size = 0
             for file in files:
-                total_size += file.get("size", 0)
-            update_query["totalSize"] = total_size
+                crawl_size += file.get("size", 0)
+
+            total_size += crawl_size
+
+            if res["state"] not in FAILED_STATES:
+                successful_count += 1
+
+            last_crawl = res
+            last_crawl_size = crawl_size
+
+        if last_crawl:
+            update_query["totalSize"] = total_size
+            update_query["crawlSuccessfulCount"] = successful_count
+
+            update_query["lastCrawlId"] = str(last_crawl.get("_id"))
+            update_query["lastCrawlStartTime"] = last_crawl.get("started")
+            update_query["lastStartedBy"] = last_crawl.get("userid")
+            update_query["lastStartedByName"] = last_crawl.get("userName")
update_query["lastCrawlState"] = last_crawl.get("state") + update_query["lastCrawlSize"] = last_crawl_size + + last_crawl_finished = last_crawl.get("finished") + update_query["lastCrawlTime"] = last_crawl_finished + + if last_crawl_finished: + update_query["lastRun"] = last_crawl_finished result = await crawl_configs.find_one_and_update( {"_id": cid, "inactive": {"$ne": True}}, diff --git a/backend/btrixcloud/migrations/migration_0003_mutable_crawl_configs.py b/backend/btrixcloud/migrations/migration_0003_mutable_crawl_configs.py index aae4a20e..f773316c 100644 --- a/backend/btrixcloud/migrations/migration_0003_mutable_crawl_configs.py +++ b/backend/btrixcloud/migrations/migration_0003_mutable_crawl_configs.py @@ -26,8 +26,7 @@ class Migration(BaseMigration): crawl_configs = self.mdb["crawl_configs"] # Return early if there are no configs - crawl_config_results = [res async for res in crawl_configs.find({})] - if not crawl_config_results: + if not await crawl_configs.count_documents({}): return utc_now_datetime = datetime.utcnow().replace(microsecond=0, tzinfo=None) @@ -45,14 +44,9 @@ class Migration(BaseMigration): await crawl_configs.update_many({}, [{"$set": {"modified": "$created"}}]) await crawl_configs.update_many({}, {"$set": {"rev": 0}}) - # Return early if there are no crawls - crawl_results = [res async for res in crawls.find({})] - if not crawl_results: - return - await crawls.update_many({}, {"$set": {"cid_rev": 0}}) - for crawl_result in crawl_results: + async for crawl_result in crawls.find({}): config_result = await crawl_configs.find_one({"_id": crawl_result["cid"]}) if not config_result: continue diff --git a/backend/btrixcloud/migrations/migration_0004_config_seeds.py b/backend/btrixcloud/migrations/migration_0004_config_seeds.py index 4586e313..a4e9f66e 100644 --- a/backend/btrixcloud/migrations/migration_0004_config_seeds.py +++ b/backend/btrixcloud/migrations/migration_0004_config_seeds.py @@ -25,11 +25,8 @@ class Migration(BaseMigration): # Migrate workflows crawl_configs = self.mdb["crawl_configs"] - crawl_config_results = [res async for res in crawl_configs.find({})] - if not crawl_config_results: - return - for config_dict in crawl_config_results: + async for config_dict in crawl_configs.find({}): seeds_to_migrate = [] seed_dicts = [] @@ -65,9 +62,8 @@ class Migration(BaseMigration): # Migrate seeds copied into crawls crawls = self.mdb["crawls"] - crawl_results = [res async for res in crawls.find({})] - for crawl_dict in crawl_results: + async for crawl_dict in crawls.find({}): seeds_to_migrate = [] seed_dicts = [] @@ -102,15 +98,13 @@ class Migration(BaseMigration): ) # Test migration - crawl_config_results = [res async for res in crawl_configs.find({})] - for config_dict in crawl_config_results: + async for config_dict in crawl_configs.find({}): config = CrawlConfig.from_dict(config_dict) for seed in config.config.seeds: assert isinstance(seed, Seed) assert seed.url - crawl_results = [res async for res in crawls.find({})] - for crawl_dict in crawl_results: + async for crawl_dict in crawls.find({}): crawl = Crawl.from_dict(crawl_dict) for seed in crawl.config.seeds: assert isinstance(seed, Seed) diff --git a/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py b/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py index 426f0134..7134f63c 100644 --- a/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py +++ b/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py @@ -37,8 +37,7 @@ class 
                 {"schedule": {"$nin": ["", None]}},
             ]
         }
-        configs_to_update = [res async for res in crawl_configs.find(match_query)]
-        for config_dict in configs_to_update:
+        async for config_dict in crawl_configs.find(match_query):
             config = CrawlConfig.from_dict(config_dict)
             print(
                 f"Updating Crawl Config {config.id}: schedule: {config.schedule}, "
diff --git a/backend/btrixcloud/migrations/migration_0006_precompute_crawl_stats.py b/backend/btrixcloud/migrations/migration_0006_precompute_crawl_stats.py
index 007ca827..f920bd64 100644
--- a/backend/btrixcloud/migrations/migration_0006_precompute_crawl_stats.py
+++ b/backend/btrixcloud/migrations/migration_0006_precompute_crawl_stats.py
@@ -24,11 +24,7 @@ class Migration(BaseMigration):
         crawl_configs = self.mdb["crawl_configs"]
         crawls = self.mdb["crawls"]
 
-        configs = [res async for res in crawl_configs.find({"inactive": {"$ne": True}})]
-        if not configs:
-            return
-
-        for config in configs:
+        async for config in crawl_configs.find({"inactive": {"$ne": True}}):
             config_id = config["_id"]
             try:
                 await stats_recompute_all(crawl_configs, crawls, config_id)
diff --git a/backend/btrixcloud/migrations/migration_0007_colls_and_config_update.py b/backend/btrixcloud/migrations/migration_0007_colls_and_config_update.py
index 99a2b276..ec5d2e94 100644
--- a/backend/btrixcloud/migrations/migration_0007_colls_and_config_update.py
+++ b/backend/btrixcloud/migrations/migration_0007_colls_and_config_update.py
@@ -24,11 +24,7 @@ class Migration(BaseMigration):
         crawls = self.mdb["crawls"]
 
         # Update workflows crawl stats to populate crawlSuccessfulCount
-        configs = [res async for res in crawl_configs.find({"inactive": {"$ne": True}})]
-        if not configs:
-            return
-
-        for config in configs:
+        async for config in crawl_configs.find({"inactive": {"$ne": True}}):
             config_id = config["_id"]
             try:
                 await stats_recompute_all(crawl_configs, crawls, config_id)
diff --git a/backend/btrixcloud/migrations/migration_0008_precompute_crawl_file_stats.py b/backend/btrixcloud/migrations/migration_0008_precompute_crawl_file_stats.py
index 1ee6392c..1bc8db2a 100644
--- a/backend/btrixcloud/migrations/migration_0008_precompute_crawl_file_stats.py
+++ b/backend/btrixcloud/migrations/migration_0008_precompute_crawl_file_stats.py
@@ -23,11 +23,7 @@ class Migration(BaseMigration):
         # pylint: disable=duplicate-code
         crawls = self.mdb["crawls"]
 
-        crawls_to_update = [res async for res in crawls.find({})]
-        if not crawls_to_update:
-            return
-
-        for crawl in crawls_to_update:
+        async for crawl in crawls.find({}):
             crawl_id = crawl["_id"]
             try:
                 await recompute_crawl_file_count_and_size(crawls, crawl_id)
diff --git a/backend/btrixcloud/migrations/migration_0010_collection_total_size.py b/backend/btrixcloud/migrations/migration_0010_collection_total_size.py
index c0b0ad96..e2a2eb3d 100644
--- a/backend/btrixcloud/migrations/migration_0010_collection_total_size.py
+++ b/backend/btrixcloud/migrations/migration_0010_collection_total_size.py
@@ -22,11 +22,7 @@ class Migration(BaseMigration):
         # pylint: disable=duplicate-code
         coll_ops = CollectionOps(self.mdb, None, None, None)
 
-        colls_to_update = [res async for res in coll_ops.collections.find({})]
-        if not colls_to_update:
-            return
-
-        for coll in colls_to_update:
+        async for coll in coll_ops.collections.find({}):
             coll_id = coll["_id"]
             try:
                 await coll_ops.update_collection_counts_and_tags(coll_id)
diff --git a/backend/btrixcloud/migrations/migration_0013_crawl_name.py b/backend/btrixcloud/migrations/migration_0013_crawl_name.py
index 9b511101..3c120f88 100644
--- a/backend/btrixcloud/migrations/migration_0013_crawl_name.py
+++ b/backend/btrixcloud/migrations/migration_0013_crawl_name.py
@@ -22,11 +22,7 @@ class Migration(BaseMigration):
         crawls = self.mdb["crawls"]
         crawl_configs = self.mdb["crawl_configs"]
 
-        configs = [res async for res in crawl_configs.find({"inactive": {"$ne": True}})]
-        if not configs:
-            return
-
-        for config in configs:
+        async for config in crawl_configs.find({"inactive": {"$ne": True}}):
             config_id = config["_id"]
             try:
                 if not config.get("name"):
diff --git a/backend/btrixcloud/migrations/migration_0015_org_storage_usage.py b/backend/btrixcloud/migrations/migration_0015_org_storage_usage.py
index a70d7e3d..c1582ee4 100644
--- a/backend/btrixcloud/migrations/migration_0015_org_storage_usage.py
+++ b/backend/btrixcloud/migrations/migration_0015_org_storage_usage.py
@@ -23,25 +23,22 @@ class Migration(BaseMigration):
         mdb_crawls = self.mdb["crawls"]
         mdb_profiles = self.mdb["profiles"]
 
-        orgs = [res async for res in mdb_orgs.find({})]
-        for org in orgs:
+        async for org in mdb_orgs.find({}):
             oid = org.get("_id")
 
             bytes_stored = 0
 
-            crawls = [res async for res in mdb_crawls.find({"oid": oid})]
-            for crawl in crawls:
+            async for crawl in mdb_crawls.find({"oid": oid}):
                 for crawl_file in crawl.get("files", []):
                     bytes_stored += crawl_file.get("size", 0)
 
-            profiles = [res async for res in mdb_profiles.find({"oid": oid})]
-            for profile in profiles:
+            async for profile in mdb_profiles.find({"oid": oid}):
                 profile_file = profile.get("resource")
                 if profile_file:
                     bytes_stored += profile_file.get("size", 0)
 
             try:
-                res = await mdb_orgs.find_one_and_update(
+                await mdb_orgs.find_one_and_update(
                     {"_id": oid}, {"$set": {"bytesStored": bytes_stored}}
                 )
             # pylint: disable=broad-exception-caught
diff --git a/backend/btrixcloud/migrations/migration_0016_operator_scheduled_jobs_v2.py b/backend/btrixcloud/migrations/migration_0016_operator_scheduled_jobs_v2.py
index 730a13a4..b1cd648e 100644
--- a/backend/btrixcloud/migrations/migration_0016_operator_scheduled_jobs_v2.py
+++ b/backend/btrixcloud/migrations/migration_0016_operator_scheduled_jobs_v2.py
@@ -28,8 +28,7 @@ class Migration(BaseMigration):
         # Update configmap for crawl configs that have non-zero timeout or scale > 1
         match_query = {"schedule": {"$nin": ["", None]}}
 
-        configs_to_update = [res async for res in crawl_configs.find(match_query)]
-        for config_dict in configs_to_update:
+        async for config_dict in crawl_configs.find(match_query):
             config = CrawlConfig.from_dict(config_dict)
             print(
                 f"Updating CronJob for Crawl Config {config.id}: schedule: {config.schedule}"
diff --git a/backend/btrixcloud/migrations/migration_0017_storage_by_type.py b/backend/btrixcloud/migrations/migration_0017_storage_by_type.py
index 024cf255..ae2b6e6f 100644
--- a/backend/btrixcloud/migrations/migration_0017_storage_by_type.py
+++ b/backend/btrixcloud/migrations/migration_0017_storage_by_type.py
@@ -23,33 +23,24 @@ class Migration(BaseMigration):
         mdb_crawls = self.mdb["crawls"]
         mdb_profiles = self.mdb["profiles"]
 
-        orgs = [res async for res in mdb_orgs.find({})]
-        for org in orgs:
+        async for org in mdb_orgs.find({}):
            oid = org.get("_id")
 
             bytes_stored_crawls = 0
             bytes_stored_uploads = 0
             bytes_stored_profiles = 0
 
-            crawls = [
-                res
-                async for res in mdb_crawls.find(
-                    {"oid": oid, "type": {"$in": [None, "crawl"]}}
-                )
-            ]
-            for crawl in crawls:
+            async for crawl in mdb_crawls.find(
+                {"oid": oid, "type": {"$in": [None, "crawl"]}}
+            ):
                 for crawl_file in crawl.get("files", []):
                     bytes_stored_crawls += crawl_file.get("size", 0)
 
-            uploads = [
-                res async for res in mdb_crawls.find({"oid": oid, "type": "upload"})
-            ]
-            for upload in uploads:
+            async for upload in mdb_crawls.find({"oid": oid, "type": "upload"}):
                 for upload_file in upload.get("files", []):
                     bytes_stored_uploads += upload_file.get("size", 0)
 
-            profiles = [res async for res in mdb_profiles.find({"oid": oid})]
-            for profile in profiles:
+            async for profile in mdb_profiles.find({"oid": oid}):
                 profile_file = profile.get("resource")
                 if profile_file:
                     bytes_stored_profiles += profile_file.get("size", 0)
@@ -59,7 +50,7 @@ class Migration(BaseMigration):
             )
 
             try:
-                res = await mdb_orgs.find_one_and_update(
+                await mdb_orgs.find_one_and_update(
                     {"_id": oid},
                     {
                         "$set": {
diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py
index a4cefc0c..64934ad9 100644
--- a/backend/btrixcloud/orgs.py
+++ b/backend/btrixcloud/orgs.py
@@ -349,9 +349,7 @@ class OrgOps:
         upload_count = 0
         page_count = 0
 
-        cursor = self.crawls_db.find({"oid": org.id})
-        items = await cursor.to_list(length=10_000)
-        for item in items:
+        async for item in self.crawls_db.find({"oid": org.id}):
             if item["state"] not in SUCCESSFUL_STATES:
                 continue
             archived_item_count += 1
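
A note on the pattern applied throughout this diff: Motor cursors support `async for`, which pulls documents from MongoDB in driver-managed batches, whereas `await cursor.to_list(length=10_000)` materializes up to 10,000 documents in memory at once and silently ignores any results past the cap. The same reasoning motivates the `count_documents` changes: an emptiness or count check does not need the documents themselves. Below is a minimal sketch of the before/after shapes; the `btrix`/`crawls` names and the localhost connection string are placeholder assumptions for the sketch, not code from this repository.

```python
import asyncio

import motor.motor_asyncio


async def total_size_before(crawls) -> int:
    # Old shape: buffers up to 10,000 documents in memory at once and
    # silently drops anything beyond the `length` cap.
    cursor = crawls.find({})
    results = await cursor.to_list(length=10_000)
    return sum(
        file_.get("size", 0) for res in results for file_ in res.get("files", [])
    )


async def total_size_after(crawls) -> int:
    # New shape: iterate the cursor directly; the driver fetches documents
    # in batches, so memory use stays bounded and no results are truncated.
    total = 0
    async for res in crawls.find({}):
        for file_ in res.get("files", []):
            total += file_.get("size", 0)
    return total


async def main() -> None:
    # Placeholder connection and collection names for this sketch only.
    client = motor.motor_asyncio.AsyncIOMotorClient("mongodb://localhost:27017")
    crawls = client["btrix"]["crawls"]
    print(await total_size_before(crawls), await total_size_after(crawls))


if __name__ == "__main__":
    asyncio.run(main())
```

Besides bounding memory, the streaming form removes a correctness hazard: any org or workflow with more than 10,000 matching documents previously had items silently excluded from its totals.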