Precompute crawl file stats (#906)

Tessa Walsh 2023-06-07 19:39:49 -04:00 committed by GitHub
parent dd757961fc
commit 120f7ca158
5 changed files with 62 additions and 5 deletions


@@ -117,6 +117,9 @@ class Crawl(CrawlConfigCore):
collections: Optional[List[UUID4]] = []
fileSize: int = 0
fileCount: int = 0
# ============================================================================
class CrawlOut(Crawl):
@@ -276,8 +279,6 @@ class CrawlOps:
# pylint: disable=duplicate-code
aggregate = [
{"$match": query},
{"$set": {"fileSize": {"$sum": "$files.size"}}},
{"$set": {"fileCount": {"$size": "$files"}}},
{"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
{"$set": {"firstSeed": "$firstSeedObject.url"}},
{"$unset": ["firstSeedObject", "errors"]},
@@ -944,16 +945,35 @@ async def add_crawl_errors(crawls, crawl_id, errors):
# ============================================================================
-async def add_crawl_file(crawls, crawl_id, crawl_file):
+async def add_crawl_file(crawls, crawl_id, crawl_file, size):
"""add new crawl file to crawl"""
await crawls.find_one_and_update(
{"_id": crawl_id},
{
"$push": {"files": crawl_file.dict()},
"$inc": {"fileCount": 1, "fileSize": size},
},
)
# ============================================================================
async def recompute_crawl_file_count_and_size(crawls, crawl_id):
"""Fully recompute file count and size for given crawl"""
file_count = 0
size = 0
crawl_raw = await crawls.find_one({"_id": crawl_id})
crawl = Crawl.from_dict(crawl_raw)
for file_ in crawl.files:
file_count += 1
size += file_.size
await crawls.find_one_and_update(
{"_id": crawl_id},
{"$set": {"fileCount": file_count, "fileSize": size}},
)
# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, too-many-statements
def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user_dep):
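Note: the following is a minimal usage sketch of the new recompute helper, not part of this commit; the connection string, database name, and crawl id are placeholders.

import asyncio

from motor.motor_asyncio import AsyncIOMotorClient

from btrixcloud.crawls import recompute_crawl_file_count_and_size


async def backfill_one(crawl_id):
    # Recompute the stored fileCount/fileSize for a single crawl document
    # by delegating to the helper above, which re-reads the crawl's files array.
    mdb = AsyncIOMotorClient("mongodb://localhost:27017")["btrixcloud"]
    await recompute_crawl_file_count_and_size(mdb["crawls"], crawl_id)


asyncio.run(backfill_one("example-crawl-id"))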


@ -15,7 +15,7 @@ from pymongo.errors import InvalidName
from .migrations import BaseMigration
CURR_DB_VERSION = "0007"
CURR_DB_VERSION = "0008"
# ============================================================================


@@ -0,0 +1,36 @@
"""
Migration 0008 - Precomputing crawl file stats
"""
from btrixcloud.crawls import recompute_crawl_file_count_and_size
from btrixcloud.migrations import BaseMigration
MIGRATION_VERSION = "0008"
class Migration(BaseMigration):
"""Migration class."""
def __init__(self, mdb, migration_version=MIGRATION_VERSION):
super().__init__(mdb, migration_version)
async def migrate_up(self):
"""Perform migration up.
Add data on crawl file count and size to database that was previously
dynamically generated in the API endpoints.
"""
# pylint: disable=duplicate-code
crawls = self.mdb["crawls"]
crawls_to_update = [res async for res in crawls.find({})]
if not crawls_to_update:
return
for crawl in crawls_to_update:
crawl_id = crawl["_id"]
try:
await recompute_crawl_file_count_and_size(crawls, crawl_id)
# pylint: disable=broad-exception-caught
except Exception as err:
print(f"Unable to update crawl {crawl_id}: {err}", flush=True)


@@ -520,7 +520,7 @@ class BtrixOperator(K8sAPI):
await redis.incr("filesAddedSize", filecomplete.size)
-await add_crawl_file(self.crawls, crawl.id, crawl_file)
+await add_crawl_file(self.crawls, crawl.id, crawl_file, filecomplete.size)
return True


@@ -80,6 +80,7 @@ def test_crawl_info(admin_auth_headers, default_org_id, admin_crawl_id):
)
data = r.json()
assert data["fileSize"] == wacz_size
assert data["fileCount"] == 1
assert data["description"] == "Admin Test Crawl description"