Precompute crawl file stats (#906)
This commit is contained in:
parent dd757961fc
commit 120f7ca158
@@ -117,6 +117,9 @@ class Crawl(CrawlConfigCore):
 
     collections: Optional[List[UUID4]] = []
 
+    fileSize: int = 0
+    fileCount: int = 0
+
 
 # ============================================================================
 class CrawlOut(Crawl):
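The zero defaults matter: crawl documents written before this change have no fileSize/fileCount keys, so they deserialize as 0 until the migration below backfills them. A minimal stand-in (not the real Crawl class) illustrating this:

    from pydantic import BaseModel

    # Stand-in model, for illustration only: pre-existing documents that
    # lack the new keys parse with fileSize=0 and fileCount=0.
    class CrawlStats(BaseModel):
        fileSize: int = 0
        fileCount: int = 0

    print(CrawlStats.parse_obj({}))  # fileSize=0 fileCount=0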
@@ -276,8 +279,6 @@ class CrawlOps:
         # pylint: disable=duplicate-code
         aggregate = [
             {"$match": query},
-            {"$set": {"fileSize": {"$sum": "$files.size"}}},
-            {"$set": {"fileCount": {"$size": "$files"}}},
             {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
             {"$set": {"firstSeed": "$firstSeedObject.url"}},
             {"$unset": ["firstSeedObject", "errors"]},
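The two deleted $set stages are what previously derived the stats on every list request, at O(len(files)) per document. A minimal motor sketch of that old query-time pattern; the $match filter and connection string are illustrative assumptions, not from this commit:

    import asyncio
    from motor.motor_asyncio import AsyncIOMotorClient

    async def list_crawls_dynamic():
        # Database/collection names assumed for illustration.
        crawls = AsyncIOMotorClient("mongodb://localhost:27017")["btrix"]["crawls"]
        aggregate = [
            {"$match": {"state": "complete"}},
            # $sum over "$files.size" totals the per-file sizes; $size counts
            # the array. Both are recomputed on every request.
            {"$set": {"fileSize": {"$sum": "$files.size"}}},
            {"$set": {"fileCount": {"$size": "$files"}}},
        ]
        return [doc async for doc in crawls.aggregate(aggregate)]

    print(asyncio.run(list_crawls_dynamic()))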
@@ -944,16 +945,35 @@ async def add_crawl_errors(crawls, crawl_id, errors):
 
 
 # ============================================================================
-async def add_crawl_file(crawls, crawl_id, crawl_file):
+async def add_crawl_file(crawls, crawl_id, crawl_file, size):
     """add new crawl file to crawl"""
     await crawls.find_one_and_update(
         {"_id": crawl_id},
-        {"$push": {"files": crawl_file.dict()}},
+        {
+            "$push": {"files": crawl_file.dict()},
+            "$inc": {"fileCount": 1, "fileSize": size},
+        },
     )
 
 
+# ============================================================================
+async def recompute_crawl_file_count_and_size(crawls, crawl_id):
+    """Fully recompute file count and size for given crawl"""
+    file_count = 0
+    size = 0
+
+    crawl_raw = await crawls.find_one({"_id": crawl_id})
+    crawl = Crawl.from_dict(crawl_raw)
+    for file_ in crawl.files:
+        file_count += 1
+        size += file_.size
+
+    await crawls.find_one_and_update(
+        {"_id": crawl_id},
+        {"$set": {"fileCount": file_count, "fileSize": size}},
+    )
+
+
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, too-many-statements
 def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user_dep):
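Bundling $push and $inc into a single find_one_and_update keeps the stored counters consistent with the files array even under concurrent writers, while recompute_crawl_file_count_and_size rebuilds them from scratch for repair and migration. An illustrative calling sketch; connection details and the crawl id are assumptions:

    import asyncio
    from motor.motor_asyncio import AsyncIOMotorClient
    from btrixcloud.crawls import recompute_crawl_file_count_and_size

    async def repair_one_crawl():
        # Database/collection names assumed for illustration.
        crawls = AsyncIOMotorClient("mongodb://localhost:27017")["btrix"]["crawls"]
        # Rebuild fileCount/fileSize for one crawl from its files array.
        await recompute_crawl_file_count_and_size(crawls, "example-crawl-id")

    asyncio.run(repair_one_crawl())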
@@ -15,7 +15,7 @@ from pymongo.errors import InvalidName
 from .migrations import BaseMigration
 
 
-CURR_DB_VERSION = "0007"
+CURR_DB_VERSION = "0008"
 
 
 # ============================================================================
@@ -0,0 +1,36 @@
+"""
+Migration 0008 - Precomputing crawl file stats
+"""
+from btrixcloud.crawls import recompute_crawl_file_count_and_size
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0008"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    def __init__(self, mdb, migration_version=MIGRATION_VERSION):
+        super().__init__(mdb, migration_version)
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Add data on crawl file count and size to database that was previously
+        dynamically generated in the API endpoints.
+        """
+        # pylint: disable=duplicate-code
+        crawls = self.mdb["crawls"]
+
+        crawls_to_update = [res async for res in crawls.find({})]
+        if not crawls_to_update:
+            return
+
+        for crawl in crawls_to_update:
+            crawl_id = crawl["_id"]
+            try:
+                await recompute_crawl_file_count_and_size(crawls, crawl_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(f"Unable to update crawl {crawl_id}: {err}", flush=True)
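The migration only needs the mdb handle shown in its constructor, so migrate_up() can be exercised by hand against a local database. A sketch under stated assumptions: the connection string, database name, and module path are all illustrative:

    import asyncio
    from motor.motor_asyncio import AsyncIOMotorClient
    from btrixcloud.migrations.migration_0008 import Migration  # module path assumed

    async def run_migration():
        # Database name assumed for illustration.
        mdb = AsyncIOMotorClient("mongodb://localhost:27017")["btrix"]
        await Migration(mdb).migrate_up()

    asyncio.run(run_migration())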
@@ -520,7 +520,7 @@ class BtrixOperator(K8sAPI):
 
         await redis.incr("filesAddedSize", filecomplete.size)
 
-        await add_crawl_file(self.crawls, crawl.id, crawl_file)
+        await add_crawl_file(self.crawls, crawl.id, crawl_file, filecomplete.size)
 
         return True
@@ -80,6 +80,7 @@ def test_crawl_info(admin_auth_headers, default_org_id, admin_crawl_id):
     )
     data = r.json()
     assert data["fileSize"] == wacz_size
+    assert data["fileCount"] == 1
     assert data["description"] == "Admin Test Crawl description"
 
 
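With the stats stored on the document, crawl detail responses carry fileSize and fileCount directly, with no per-request aggregation. A sketch of checking this by hand; the host, endpoint path, and token are assumptions based on the test above:

    import requests

    # Endpoint path and credentials assumed for illustration.
    r = requests.get(
        "https://localhost/api/orgs/<org-id>/crawls/<crawl-id>.json",
        headers={"Authorization": "Bearer <token>"},
    )
    data = r.json()
    print(data["fileSize"], data["fileCount"])  # read straight from the document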