Precompute crawl file stats (#906)
parent dd757961fc
commit 120f7ca158
@@ -117,6 +117,9 @@ class Crawl(CrawlConfigCore):
 
     collections: Optional[List[UUID4]] = []
 
+    fileSize: int = 0
+    fileCount: int = 0
+
 
 # ============================================================================
 class CrawlOut(Crawl):
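
A note on backward compatibility (not part of the diff): documents written before this change carry no fileSize or fileCount keys, and pydantic fills in the declared defaults when they are loaded through the model. A minimal self-contained sketch, with a simplified stand-in for the real Crawl model:

from typing import List, Optional

from pydantic import BaseModel, UUID4


class CrawlSketch(BaseModel):
    """Hypothetical stand-in for Crawl; the real model has many more fields."""

    collections: Optional[List[UUID4]] = []
    fileSize: int = 0
    fileCount: int = 0


# A pre-migration document without the new keys still validates,
# picking up the declared defaults of 0.
crawl = CrawlSketch(**{"collections": []})
assert crawl.fileSize == 0 and crawl.fileCount == 0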
@@ -276,8 +279,6 @@ class CrawlOps:
         # pylint: disable=duplicate-code
         aggregate = [
             {"$match": query},
-            {"$set": {"fileSize": {"$sum": "$files.size"}}},
-            {"$set": {"fileCount": {"$size": "$files"}}},
             {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
             {"$set": {"firstSeed": "$firstSeedObject.url"}},
             {"$unset": ["firstSeedObject", "errors"]},
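
Why those two $set stages can be dropped: fileSize and fileCount now live on the crawl document itself, so listings read stored fields instead of recomputing them from the files array on every query. A minimal sketch of the after state, assuming a motor client and the collection layout above (database, field, and variable names here are illustrative, not the commit's code):

from motor.motor_asyncio import AsyncIOMotorClient


async def list_crawl_stats(org_id):
    # Illustrative connection; the real app gets its collection from mdb.
    crawls = AsyncIOMotorClient("mongodb://localhost:27017")["btrix"]["crawls"]

    # Before this commit, every listing recomputed the stats in the pipeline:
    #   {"$set": {"fileSize": {"$sum": "$files.size"}}}
    #   {"$set": {"fileCount": {"$size": "$files"}}}
    # With the fields precomputed, a plain find() projection is enough.
    # ("oid" is assumed here to be the org id field.)
    return [
        doc
        async for doc in crawls.find(
            {"oid": org_id}, {"fileSize": 1, "fileCount": 1}
        )
    ]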
@@ -944,16 +945,35 @@ async def add_crawl_errors(crawls, crawl_id, errors):
 
 
 # ============================================================================
-async def add_crawl_file(crawls, crawl_id, crawl_file):
+async def add_crawl_file(crawls, crawl_id, crawl_file, size):
     """add new crawl file to crawl"""
     await crawls.find_one_and_update(
         {"_id": crawl_id},
-        {"$push": {"files": crawl_file.dict()}},
+        {
+            "$push": {"files": crawl_file.dict()},
+            "$inc": {"fileCount": 1, "fileSize": size},
+        },
     )
 
 
+# ============================================================================
+async def recompute_crawl_file_count_and_size(crawls, crawl_id):
+    """Fully recompute file count and size for given crawl"""
+    file_count = 0
+    size = 0
+
+    crawl_raw = await crawls.find_one({"_id": crawl_id})
+    crawl = Crawl.from_dict(crawl_raw)
+    for file_ in crawl.files:
+        file_count += 1
+        size += file_.size
+
+    await crawls.find_one_and_update(
+        {"_id": crawl_id},
+        {"$set": {"fileCount": file_count, "fileSize": size}},
+    )
+
+
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, too-many-statements
 def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user_dep):
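
Two things worth noting in this hunk: the $push and $inc land in a single find_one_and_update, so the files array and its stats change atomically; and recompute_crawl_file_count_and_size does its arithmetic client-side after fetching the document. For comparison, a hedged sketch of the same full recompute done entirely server-side (not the commit's approach; requires MongoDB 4.2+ aggregation-pipeline updates):

# Sketch only, not the commit's code: `crawls` is the motor collection
# used throughout this file; $ifNull guards crawls that have no files
# array yet.
async def recompute_server_side(crawls, crawl_id):
    await crawls.find_one_and_update(
        {"_id": crawl_id},
        [
            {
                "$set": {
                    "fileCount": {"$size": {"$ifNull": ["$files", []]}},
                    "fileSize": {"$sum": {"$ifNull": ["$files.size", []]}},
                }
            }
        ],
    )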
@@ -15,7 +15,7 @@ from pymongo.errors import InvalidName
 from .migrations import BaseMigration
 
 
-CURR_DB_VERSION = "0007"
+CURR_DB_VERSION = "0008"
 
 
 # ============================================================================
@@ -0,0 +1,36 @@
+"""
+Migration 0008 - Precomputing crawl file stats
+"""
+from btrixcloud.crawls import recompute_crawl_file_count_and_size
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0008"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    def __init__(self, mdb, migration_version=MIGRATION_VERSION):
+        super().__init__(mdb, migration_version)
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Add data on crawl file count and size to database that was previously
+        dynamically generated in the API endpoints.
+        """
+        # pylint: disable=duplicate-code
+        crawls = self.mdb["crawls"]
+
+        crawls_to_update = [res async for res in crawls.find({})]
+        if not crawls_to_update:
+            return
+
+        for crawl in crawls_to_update:
+            crawl_id = crawl["_id"]
+            try:
+                await recompute_crawl_file_count_and_size(crawls, crawl_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(f"Unable to update crawl {crawl_id}: {err}", flush=True)
@@ -520,7 +520,7 @@ class BtrixOperator(K8sAPI):
 
             await redis.incr("filesAddedSize", filecomplete.size)
 
-            await add_crawl_file(self.crawls, crawl.id, crawl_file)
+            await add_crawl_file(self.crawls, crawl.id, crawl_file, filecomplete.size)
 
             return True
 
@@ -80,6 +80,7 @@ def test_crawl_info(admin_auth_headers, default_org_id, admin_crawl_id):
     )
     data = r.json()
     assert data["fileSize"] == wacz_size
+    assert data["fileCount"] == 1
     assert data["description"] == "Admin Test Crawl description"
 
 
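
For context, roughly how the updated test exercises the precomputed fields; the endpoint path and API_PREFIX below are stand-ins, not the repo's actual values, and the fixtures are the ones named in the hunk header:

import requests

API_PREFIX = "http://localhost:30870/api"  # assumption: local test deployment


def test_crawl_file_stats(admin_auth_headers, default_org_id, admin_crawl_id):
    # Hypothetical endpoint path; the real test hits the project's crawl
    # detail endpoint for the org.
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}.json",
        headers=admin_auth_headers,
    )
    data = r.json()
    # The test crawl wrote a single WACZ, so the stored stats should match
    # what the old per-request aggregation would have produced.
    assert data["fileCount"] == 1
    assert data["fileSize"] > 0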