Precompute crawl file stats (#906)
parent dd757961fc
commit 120f7ca158
@@ -117,6 +117,9 @@ class Crawl(CrawlConfigCore):
 
     collections: Optional[List[UUID4]] = []
 
+    fileSize: int = 0
+    fileCount: int = 0
+
 
 # ============================================================================
 class CrawlOut(Crawl):

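With fileSize and fileCount stored on the crawl document, callers can read the stats directly instead of aggregating over the files array at query time. A minimal sketch of such a read, assuming `crawls` is a motor async collection handle; the helper name and projection are illustrative, not part of this commit:

    # Hypothetical helper: read the precomputed stats straight off the document.
    # `crawls` is assumed to be a motor/pymongo async collection.
    async def get_crawl_file_stats(crawls, crawl_id):
        crawl = await crawls.find_one({"_id": crawl_id}, ["fileCount", "fileSize"])
        if not crawl:
            return 0, 0
        return crawl.get("fileCount", 0), crawl.get("fileSize", 0)
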
@@ -276,8 +279,6 @@ class CrawlOps:
         # pylint: disable=duplicate-code
         aggregate = [
             {"$match": query},
-            {"$set": {"fileSize": {"$sum": "$files.size"}}},
-            {"$set": {"fileCount": {"$size": "$files"}}},
             {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
             {"$set": {"firstSeed": "$firstSeedObject.url"}},
             {"$unset": ["firstSeedObject", "errors"]},

@@ -944,16 +945,35 @@ async def add_crawl_errors(crawls, crawl_id, errors):
 
 
 # ============================================================================
-async def add_crawl_file(crawls, crawl_id, crawl_file):
+async def add_crawl_file(crawls, crawl_id, crawl_file, size):
     """add new crawl file to crawl"""
     await crawls.find_one_and_update(
         {"_id": crawl_id},
         {
             "$push": {"files": crawl_file.dict()},
+            "$inc": {"fileCount": 1, "fileSize": size},
         },
     )
 
 
+# ============================================================================
+async def recompute_crawl_file_count_and_size(crawls, crawl_id):
+    """Fully recompute file count and size for given crawl"""
+    file_count = 0
+    size = 0
+
+    crawl_raw = await crawls.find_one({"_id": crawl_id})
+    crawl = Crawl.from_dict(crawl_raw)
+    for file_ in crawl.files:
+        file_count += 1
+        size += file_.size
+
+    await crawls.find_one_and_update(
+        {"_id": crawl_id},
+        {"$set": {"fileCount": file_count, "fileSize": size}},
+    )
+
+
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, too-many-statements
 def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user_dep):

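recompute_crawl_file_count_and_size can also serve as a one-off backfill outside the migration added below. A rough sketch, assuming a local MongoDB and a database name of "browsertrix"; both are assumptions, not values taken from this commit:

    # Rough backfill sketch reusing the new helper; the connection string and
    # database name are placeholders, not values from this commit.
    import asyncio
    from motor.motor_asyncio import AsyncIOMotorClient
    from btrixcloud.crawls import recompute_crawl_file_count_and_size

    async def backfill():
        client = AsyncIOMotorClient("mongodb://localhost:27017")
        crawls = client["browsertrix"]["crawls"]
        async for crawl in crawls.find({}, ["_id"]):
            await recompute_crawl_file_count_and_size(crawls, crawl["_id"])

    asyncio.run(backfill())
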
@@ -15,7 +15,7 @@ from pymongo.errors import InvalidName
 from .migrations import BaseMigration
 
 
-CURR_DB_VERSION = "0007"
+CURR_DB_VERSION = "0008"
 
 
 # ============================================================================

@@ -0,0 +1,36 @@
+"""
+Migration 0008 - Precomputing crawl file stats
+"""
+from btrixcloud.crawls import recompute_crawl_file_count_and_size
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0008"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    def __init__(self, mdb, migration_version=MIGRATION_VERSION):
+        super().__init__(mdb, migration_version)
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Add data on crawl file count and size to database that was previously
+        dynamically generated in the API endpoints.
+        """
+        # pylint: disable=duplicate-code
+        crawls = self.mdb["crawls"]
+
+        crawls_to_update = [res async for res in crawls.find({})]
+        if not crawls_to_update:
+            return
+
+        for crawl in crawls_to_update:
+            crawl_id = crawl["_id"]
+            try:
+                await recompute_crawl_file_count_and_size(crawls, crawl_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(f"Unable to update crawl {crawl_id}: {err}", flush=True)

@@ -520,7 +520,7 @@ class BtrixOperator(K8sAPI):
 
         await redis.incr("filesAddedSize", filecomplete.size)
 
-        await add_crawl_file(self.crawls, crawl.id, crawl_file)
+        await add_crawl_file(self.crawls, crawl.id, crawl_file, filecomplete.size)
 
         return True
 

@@ -80,6 +80,7 @@ def test_crawl_info(admin_auth_headers, default_org_id, admin_crawl_id):
     )
     data = r.json()
     assert data["fileSize"] == wacz_size
+    assert data["fileCount"] == 1
     assert data["description"] == "Admin Test Crawl description"
 
 