Pre-compute workflow last crawl information (#812)
* Precompute config crawl stats
* Includes a database migration to move previously dynamically computed crawl stats for workflows into the CrawlConfig model.
* Add crawls.finished descending index
* Add last crawl fields to workflow tests
parent 9fcbc3f87e
commit 8281ba723e
@@ -178,6 +178,16 @@ class CrawlConfig(CrawlConfigCore):

    rev: int = 0

+    crawlCount: Optional[int] = 0
+    totalSize: Optional[int] = 0
+
+    lastCrawlId: Optional[str]
+    lastCrawlStartTime: Optional[datetime]
+    lastStartedBy: Optional[UUID4]
+    lastCrawlTime: Optional[datetime]
+    lastCrawlState: Optional[str]
+    lastCrawlSize: Optional[int]
+
    def get_raw_config(self):
        """serialize config for browsertrix-crawler"""
        return self.config.dict(exclude_unset=True, exclude_none=True)
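Note: the fields added above are the denormalized last-crawl stats now stored directly on each workflow document; the next hunk removes the duplicate declarations from CrawlConfigOut, which inherits them instead. A minimal standalone sketch of just these fields, with a plain pydantic BaseModel standing in for the real CrawlConfigCore base and every other workflow field omitted (the `= None` defaults are added here only so the sketch also runs under pydantic v2):

from datetime import datetime
from typing import Optional

from pydantic import BaseModel, UUID4


class WorkflowLastCrawlStats(BaseModel):
    """Illustrative subset: the precomputed stats kept on each workflow"""

    crawlCount: Optional[int] = 0
    totalSize: Optional[int] = 0

    lastCrawlId: Optional[str] = None
    lastCrawlStartTime: Optional[datetime] = None
    lastStartedBy: Optional[UUID4] = None
    lastCrawlTime: Optional[datetime] = None
    lastCrawlState: Optional[str] = None
    lastCrawlSize: Optional[int] = None


print(WorkflowLastCrawlStats())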
@@ -200,15 +210,6 @@ class CrawlConfigOut(CrawlConfig):

    firstSeed: Optional[str]

-    totalSize: Optional[int] = 0
-
-    crawlCount: Optional[int] = 0
-    lastCrawlId: Optional[str]
-    lastCrawlStartTime: Optional[datetime]
-    lastCrawlTime: Optional[datetime]
-    lastCrawlState: Optional[str]
-    lastCrawlSize: Optional[int]


# ============================================================================
class CrawlConfigIdNameOut(BaseMongoModel):
@@ -241,6 +242,7 @@ class CrawlConfigOps:

    def __init__(self, dbclient, mdb, user_manager, org_ops, crawl_manager, profiles):
        self.dbclient = dbclient
+        self.crawls = mdb["crawls"]
        self.crawl_configs = mdb["crawl_configs"]
        self.config_revs = mdb["configs_revs"]
        self.user_manager = user_manager
@@ -264,7 +266,7 @@ class CrawlConfigOps:
        self.crawl_ops = ops

    async def init_index(self):
-        """init index for crawls db"""
+        """init index for crawlconfigs db collection"""
        await self.crawl_configs.create_index(
            [("oid", pymongo.HASHED), ("inactive", pymongo.ASCENDING)]
        )
@@ -509,84 +511,6 @@ class CrawlConfigOps:
            # Set firstSeed
            {"$set": {"firstSeed": "$firstSeedObject.url"}},
            {"$unset": ["firstSeedObject"]},
-            {
-                "$lookup": {
-                    "from": "crawls",
-                    "localField": "_id",
-                    "foreignField": "cid",
-                    "as": "configCrawls",
-                },
-            },
-            # Filter workflow crawls on finished and active
-            {
-                "$set": {
-                    "finishedCrawls": {
-                        "$filter": {
-                            "input": "$configCrawls",
-                            "as": "filterCrawls",
-                            "cond": {
-                                "$and": [
-                                    {"$ne": ["$$filterCrawls.finished", None]},
-                                    {"$ne": ["$$filterCrawls.inactive", True]},
-                                ]
-                            },
-                        }
-                    }
-                }
-            },
-            # Set crawl count to number of finished crawls
-            {"$set": {"crawlCount": {"$size": "$finishedCrawls"}}},
-            # Sort finished crawls by finished time descending to get latest
-            {
-                "$set": {
-                    "sortedCrawls": {
-                        "$sortArray": {
-                            "input": "$finishedCrawls",
-                            "sortBy": {"finished": -1},
-                        }
-                    }
-                }
-            },
-            {"$unset": ["finishedCrawls"]},
-            {"$set": {"lastCrawl": {"$arrayElemAt": ["$sortedCrawls", 0]}}},
-            {"$set": {"lastCrawlId": "$lastCrawl._id"}},
-            {"$set": {"lastCrawlStartTime": "$lastCrawl.started"}},
-            {"$set": {"lastCrawlTime": "$lastCrawl.finished"}},
-            {"$set": {"lastCrawlState": "$lastCrawl.state"}},
-            # Get userid of last started crawl
-            {"$set": {"lastStartedBy": "$lastCrawl.userid"}},
-            {"$set": {"lastCrawlSize": {"$sum": "$lastCrawl.files.size"}}},
-            {
-                "$lookup": {
-                    "from": "users",
-                    "localField": "lastStartedBy",
-                    "foreignField": "id",
-                    "as": "lastStartedByName",
-                },
-            },
-            {
-                "$set": {
-                    "lastStartedByName": {
-                        "$arrayElemAt": ["$lastStartedByName.name", 0]
-                    }
-                }
-            },
-            {
-                "$set": {
-                    "totalSize": {
-                        "$sum": {
-                            "$map": {
-                                "input": "$sortedCrawls.files",
-                                "as": "crawlFile",
-                                "in": {"$arrayElemAt": ["$$crawlFile.size", 0]},
-                            }
-                        }
-                    }
-                }
-            },
-            # unset
-            {"$unset": ["lastCrawl"]},
-            {"$unset": ["sortedCrawls"]},
        ]

        if first_seed:
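Note: the stages removed above joined every workflow against the crawls collection and sorted the joined array on every listing request. With the same values now written to the config documents, the read path can be a plain find. A minimal read-side sketch (motor collection handle assumed; field names come from this diff, while the helper name, org filter, and projection choice are illustrative):

async def list_workflow_stats(crawl_configs, oid):
    # the stats are already on the documents, so no $lookup/$sortArray is needed
    cursor = crawl_configs.find(
        {"oid": oid, "inactive": {"$ne": True}},
        projection=[
            "name",
            "crawlCount",
            "totalSize",
            "lastCrawlId",
            "lastCrawlTime",
            "lastCrawlState",
            "lastCrawlSize",
        ],
    )
    return await cursor.to_list(length=1_000)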
@@ -611,6 +535,19 @@ class CrawlConfigOps:
                },
            },
            {"$set": {"createdByName": {"$arrayElemAt": ["$userName.name", 0]}}},
+            {
+                "$lookup": {
+                    "from": "users",
+                    "localField": "lastStartedBy",
+                    "foreignField": "id",
+                    "as": "startedName",
+                },
+            },
+            {
+                "$set": {
+                    "lastStartedByName": {"$arrayElemAt": ["$startedName.name", 0]}
+                }
+            },
            {
                "$lookup": {
                    "from": "users",
@@ -648,22 +585,12 @@ class CrawlConfigOps:
        except (IndexError, ValueError):
            total = 0

-        # crawls = await self.crawl_manager.list_running_crawls(oid=org.id)
-        crawls, _ = await self.crawl_ops.list_crawls(
-            org=org,
-            running_only=True,
-            # Set high so that when we lower default we still get all running crawls
-            page_size=1_000,
-        )
-        running = {}
-        for crawl in crawls:
-            running[crawl.cid] = crawl
-
        configs = []
        for res in items:
            config = CrawlConfigOut.from_dict(res)
            # pylint: disable=invalid-name
-            self._add_curr_crawl_stats(config, running.get(config.id))
+            if not config.inactive:
+                self._add_curr_crawl_stats(config, await self.get_running_crawl(config))
            configs.append(config)

        return configs, total
@@ -693,20 +620,9 @@ class CrawlConfigOps:

        return None

-    async def _annotate_with_crawl_stats(self, crawlconfig: CrawlConfigOut):
-        """Annotate crawlconfig with information about associated crawls"""
-        crawl_stats = await self.crawl_ops.get_latest_crawl_and_count_by_config(
-            cid=crawlconfig.id
-        )
-        crawlconfig.crawlCount = crawl_stats["crawl_count"]
-        crawlconfig.totalSize = crawl_stats["total_size"]
-        crawlconfig.lastCrawlId = crawl_stats["last_crawl_id"]
-        crawlconfig.lastCrawlStartTime = crawl_stats["last_crawl_started"]
-        crawlconfig.lastCrawlTime = crawl_stats["last_crawl_finished"]
-        crawlconfig.lastStartedByName = crawl_stats["last_started_by"]
-        crawlconfig.lastCrawlState = crawl_stats["last_crawl_state"]
-        crawlconfig.lastCrawlSize = crawl_stats["last_crawl_size"]
-        return crawlconfig
+    async def update_crawl_stats(self, cid: uuid.UUID):
+        """Update crawl count, total size, and last crawl information for config."""
+        await update_config_crawl_stats(self.crawl_configs, self.crawls, cid)

    def _add_curr_crawl_stats(self, crawlconfig, crawl):
        """Add stats from current running crawl, if any"""
@@ -745,13 +661,17 @@ class CrawlConfigOps:
        if modified_user:
            crawlconfig.modifiedByName = modified_user.name

+        if crawlconfig.lastStartedBy:
+            last_started_user = await self.user_manager.get(crawlconfig.lastStartedBy)
+            # pylint: disable=invalid-name
+            if last_started_user:
+                crawlconfig.lastStartedByName = last_started_user.name
+
        if crawlconfig.profileid:
            crawlconfig.profileName = await self.profiles.get_profile_name(
                crawlconfig.profileid, org
            )

-        crawlconfig = await self._annotate_with_crawl_stats(crawlconfig)
-
        return crawlconfig

    async def get_crawl_config(
@@ -957,6 +877,58 @@ async def inc_crawl_count(crawl_configs, cid: uuid.UUID):
    )


+# ============================================================================
+# pylint: disable=too-many-locals
+async def update_config_crawl_stats(crawl_configs, crawls, cid: uuid.UUID):
+    """re-calculate and update crawl statistics for config"""
+    update_query = {
+        "crawlCount": 0,
+        "totalSize": 0,
+        "lastCrawlId": None,
+        "lastCrawlStartTime": None,
+        "lastStartedBy": None,
+        "lastCrawlTime": None,
+        "lastCrawlState": None,
+        "lastCrawlSize": None,
+    }
+
+    match_query = {"cid": cid, "finished": {"$ne": None}, "inactive": {"$ne": True}}
+    cursor = crawls.find(match_query).sort("finished", pymongo.DESCENDING)
+    results = await cursor.to_list(length=10_000)
+    if results:
+        update_query["crawlCount"] = len(results)
+
+        last_crawl = results[0]
+        update_query["lastCrawlId"] = str(last_crawl.get("_id"))
+        update_query["lastCrawlStartTime"] = last_crawl.get("started")
+        update_query["lastStartedBy"] = last_crawl.get("userid")
+        update_query["lastCrawlTime"] = last_crawl.get("finished")
+        update_query["lastCrawlState"] = last_crawl.get("state")
+        update_query["lastCrawlSize"] = sum(
+            file_.get("size", 0) for file_ in last_crawl.get("files", [])
+        )
+
+        total_size = 0
+        for res in results:
+            files = res.get("files", [])
+            for file in files:
+                total_size += file.get("size", 0)
+        update_query["totalSize"] = total_size
+
+    result = await crawl_configs.find_one_and_update(
+        {"_id": cid, "inactive": {"$ne": True}},
+        {"$set": update_query},
+        return_document=pymongo.ReturnDocument.AFTER,
+    )
+
+    if not result:
+        raise HTTPException(
+            status_code=404, detail=f"Crawl Config '{cid}' not found to update"
+        )
+
+    return result
+
+
# ============================================================================
# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
def init_crawl_config_api(
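Note: update_config_crawl_stats above is the single write-side helper that the migration, the operator, and CrawlOps.delete_crawls all reuse. A minimal sketch of calling it directly against a local MongoDB via motor; the connection URL, database name, and uuidRepresentation setting are assumptions, while collection names follow the diff:

import asyncio
import uuid

from motor.motor_asyncio import AsyncIOMotorClient


async def refresh_workflow_stats(cid: uuid.UUID):
    # uuidRepresentation="standard" so the uuid cid filter encodes cleanly (assumption)
    client = AsyncIOMotorClient("mongodb://localhost:27017", uuidRepresentation="standard")
    mdb = client["browsertrixcloud"]  # database name is a placeholder
    updated = await update_config_crawl_stats(mdb["crawl_configs"], mdb["crawls"], cid)
    print(updated["crawlCount"], updated["totalSize"], updated["lastCrawlState"])


# asyncio.run(refresh_workflow_stats(uuid.UUID("<workflow-id>")))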
@@ -196,6 +196,10 @@ class CrawlOps:

        self.presign_duration = int(os.environ.get("PRESIGN_DURATION_SECONDS", 3600))

+    async def init_index(self):
+        """init index for crawls db collection"""
+        await self.crawls.create_index([("finished", pymongo.DESCENDING)])
+
    async def list_crawls(
        self,
        org: Optional[Organization] = None,
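Note: the new descending index on crawls.finished backs the recompute query, which always wants a workflow's finished crawls newest-first. A minimal sketch of that access pattern (motor collection handle assumed; the helper name is illustrative, the filter and sort mirror update_config_crawl_stats above):

import pymongo


async def latest_finished_crawl(crawls, cid):
    # same index as init_index above; create_index is idempotent
    await crawls.create_index([("finished", pymongo.DESCENDING)])
    cursor = (
        crawls.find({"cid": cid, "finished": {"$ne": None}, "inactive": {"$ne": True}})
        .sort("finished", pymongo.DESCENDING)
        .limit(1)
    )
    docs = await cursor.to_list(length=1)
    return docs[0] if docs else None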
@@ -380,45 +384,6 @@ class CrawlOps:

        return await self._resolve_crawl_refs(crawl, org)

-    async def get_latest_crawl_and_count_by_config(self, cid: str):
-        """Get crawl statistics for a crawl_config with id cid."""
-        stats = {
-            "crawl_count": 0,
-            "total_size": 0,
-            "last_crawl_id": None,
-            "last_crawl_started": None,
-            "last_crawl_finished": None,
-            "last_crawl_state": None,
-            "last_started_by": None,
-            "last_crawl_size": 0,
-        }
-
-        match_query = {"cid": cid, "finished": {"$ne": None}, "inactive": {"$ne": True}}
-        cursor = self.crawls.find(match_query).sort("finished", pymongo.DESCENDING)
-        results = await cursor.to_list(length=1000)
-        if results:
-            stats["crawl_count"] = len(results)
-
-            last_crawl = Crawl.from_dict(results[0])
-            stats["last_crawl_id"] = str(last_crawl.id)
-            stats["last_crawl_started"] = last_crawl.started
-            stats["last_crawl_finished"] = last_crawl.finished
-            stats["last_crawl_state"] = last_crawl.state
-            stats["last_crawl_size"] = sum(file_.size for file_ in last_crawl.files)
-
-            user = await self.user_manager.get(last_crawl.userid)
-            if user:
-                stats["last_started_by"] = user.name
-
-            total_size = 0
-            for res in results:
-                files = res["files"]
-                for file in files:
-                    total_size += file["size"]
-            stats["total_size"] = total_size
-
-        return stats
-
    async def _resolve_crawl_refs(
        self,
        crawl: Union[CrawlOut, ListCrawlOut],
@@ -527,13 +492,22 @@ class CrawlOps:

    async def delete_crawls(self, org: Organization, delete_list: DeleteCrawlList):
        """Delete a list of crawls by id for given org"""
+        cids_to_update = set()
+
        for crawl_id in delete_list.crawl_ids:
            await self._delete_crawl_files(org, crawl_id)
+            await self.remove_crawl_from_collections(org, crawl_id)
+
+            crawl = await self.get_crawl_raw(crawl_id, org)
+            cids_to_update.add(crawl["cid"])

        res = await self.crawls.delete_many(
            {"_id": {"$in": delete_list.crawl_ids}, "oid": org.id}
        )

+        for cid in cids_to_update:
+            await self.crawl_configs.update_crawl_stats(cid)
+
        return res.deleted_count

    async def _delete_crawl_files(self, org: Organization, crawl_id: str):
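Note: delete_crawls now gathers the cid of every deleted crawl into a set before recomputing, so a workflow whose crawls are deleted in bulk has its stats refreshed once rather than once per crawl. A toy illustration of that design choice (names are illustrative, not part of the API):

def configs_to_refresh(deleted_crawls):
    """Return the distinct workflow ids touched by a batch of deleted crawls."""
    return {crawl["cid"] for crawl in deleted_crawls}


assert configs_to_refresh(
    [{"cid": "wf-1"}, {"cid": "wf-1"}, {"cid": "wf-2"}]
) == {"wf-1", "wf-2"}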
@@ -837,7 +811,7 @@ class CrawlOps:

        return resp

-    async def remove_crawl_from_collections(self, oid: uuid.UUID, crawl_id: str):
+    async def remove_crawl_from_collections(self, org: Organization, crawl_id: str):
        """Remove crawl with given crawl_id from all collections it belongs to"""
        collections = [
            coll["name"]
@@ -845,7 +819,7 @@ class CrawlOps:
        ]
        for collection_name in collections:
            await self.collections.find_one_and_update(
-                {"name": collection_name, "oid": oid},
+                {"name": collection_name, "oid": org.id},
                {"$pull": {"crawlIds": crawl_id}},
            )

@@ -884,7 +858,11 @@ async def add_new_crawl(
# ============================================================================
async def update_crawl(crawls, crawl_id, **kwargs):
    """update crawl state in db"""
-    await crawls.find_one_and_update({"_id": crawl_id}, {"$set": kwargs})
+    return await crawls.find_one_and_update(
+        {"_id": crawl_id},
+        {"$set": kwargs},
+        return_document=pymongo.ReturnDocument.AFTER,
+    )


# ============================================================================
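Note: update_crawl now passes return_document=ReturnDocument.AFTER, so callers get the post-update crawl document back; the operator relies on that to read the crawl's cid without a second query. A minimal sketch of the pattern (standard pymongo/motor API; the collection handle, helper name, and arguments are assumptions):

import pymongo


async def mark_finished(crawls, crawl_id, state, finished):
    # returns the document as it looks after the update, or None if no match
    crawl = await crawls.find_one_and_update(
        {"_id": crawl_id},
        {"$set": {"state": state, "finished": finished}},
        return_document=pymongo.ReturnDocument.AFTER,
    )
    return crawl.get("cid") if crawl else None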
@@ -1056,8 +1034,6 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
                    status_code=400, detail=f"Error Stopping Crawl: {exc}"
                )

-        await ops.remove_crawl_from_collections(crawl.oid, crawl.id)
-
        res = await ops.delete_crawls(org, delete_list)

        return {"deleted": res}
@@ -13,7 +13,7 @@ from pymongo.errors import InvalidName
from .migrations import BaseMigration


-CURR_DB_VERSION = "0005"
+CURR_DB_VERSION = "0006"


# ============================================================================
@@ -55,6 +55,7 @@ async def update_and_prepare_db(
    mdb,
    user_manager,
    org_ops,
+    crawl_ops,
    crawl_config_ops,
    coll_ops,
    invite_ops,
@@ -70,7 +71,7 @@ async def update_and_prepare_db(
    print("Database setup started", flush=True)
    if await run_db_migrations(mdb, user_manager):
        await drop_indexes(mdb)
-        await create_indexes(org_ops, crawl_config_ops, coll_ops, invite_ops)
+        await create_indexes(org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops)
    await user_manager.create_super_user()
    await org_ops.create_default_org()
    print("Database updated and ready", flush=True)
@@ -134,10 +135,11 @@ async def drop_indexes(mdb):


# ============================================================================
-async def create_indexes(org_ops, crawl_config_ops, coll_ops, invite_ops):
+async def create_indexes(org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops):
    """Create database indexes."""
    print("Creating database indexes", flush=True)
    await org_ops.init_index()
+    await crawl_ops.init_index()
    await crawl_config_ops.init_index()
    await coll_ops.init_index()
    await invite_ops.init_index()
@@ -114,7 +114,7 @@ def main():
    if run_once_lock("btrix-init-db"):
        asyncio.create_task(
            update_and_prepare_db(
-                mdb, user_manager, org_ops, crawl_config_ops, coll_ops, invites
+                mdb, user_manager, org_ops, crawls, crawl_config_ops, coll_ops, invites
            )
        )

@@ -0,0 +1,31 @@
"""
Migration 0006 - Precomputing workflow crawl stats
"""
from btrixcloud.crawlconfigs import update_config_crawl_stats
from btrixcloud.migrations import BaseMigration


MIGRATION_VERSION = "0006"


class Migration(BaseMigration):
    """Migration class."""

    def __init__(self, mdb, migration_version=MIGRATION_VERSION):
        super().__init__(mdb, migration_version)

    async def migrate_up(self):
        """Perform migration up.

        Add data on workflow crawl statistics that was previously dynamically
        computed when needed to the database.
        """
        crawl_configs = self.mdb["crawl_configs"]
        crawls = self.mdb["crawls"]

        configs = [res async for res in crawl_configs.find({})]
        if not configs:
            return

        for config in configs:
            await update_config_crawl_stats(crawl_configs, crawls, config["_id"])
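Note: the migration simply replays update_config_crawl_stats over every existing workflow. A minimal sketch of invoking it by hand, assuming the Migration class above is importable from its module (path not shown in this diff) and a MongoDB instance is reachable; the connection URL, database name, and uuidRepresentation setting are assumptions, and calling migrate_up() directly skips whatever version bookkeeping BaseMigration normally performs:

import asyncio

from motor.motor_asyncio import AsyncIOMotorClient


async def main():
    client = AsyncIOMotorClient("mongodb://localhost:27017", uuidRepresentation="standard")
    await Migration(client["browsertrixcloud"]).migrate_up()


# asyncio.run(main())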
@@ -19,6 +19,7 @@ from .k8sapi import K8sAPI

from .db import init_db
from .orgs import inc_org_stats
+from .crawlconfigs import update_config_crawl_stats
from .crawls import (
    CrawlFile,
    CrawlCompleteIn,
@@ -97,6 +98,7 @@ class BtrixOperator(K8sAPI):

        _, mdb = init_db()
        self.crawls = mdb["crawls"]
+        self.crawl_configs = mdb["crawl_configs"]
        self.orgs = mdb["organizations"]

        self.done_key = "crawls-done"
@@ -446,7 +448,10 @@ class BtrixOperator(K8sAPI):
        if stats:
            kwargs["stats"] = stats

-        await update_crawl(self.crawls, crawl_id, **kwargs)
+        crawl = await update_crawl(self.crawls, crawl_id, **kwargs)
+        crawl_cid = crawl.get("cid")
+
+        await update_config_crawl_stats(self.crawl_configs, self.crawls, crawl_cid)

        if redis:
            await self.add_crawl_errors_to_db(redis, crawl_id)
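Note: the operator hunk above is the write path that keeps the precomputed fields current: once a crawl reaches a finished state, the crawl document is updated and the parent workflow's stats are recomputed immediately. A condensed sketch of that wiring in isolation, reusing the two helpers from this diff (the wrapper's name and arguments are illustrative; the import paths follow the module names used elsewhere in the diff):

from btrixcloud.crawlconfigs import update_config_crawl_stats
from btrixcloud.crawls import update_crawl


async def on_crawl_finished(crawls, crawl_configs, crawl_id, state, finished):
    # persist the final crawl state and get the updated document back
    crawl = await update_crawl(crawls, crawl_id, state=state, finished=finished)
    # refresh the denormalized stats on the owning workflow
    await update_config_crawl_stats(crawl_configs, crawls, crawl.get("cid"))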
@@ -208,7 +208,7 @@ def test_verify_revs_history(crawler_auth_headers, default_org_id):
    assert sorted_data[0]["config"]["scopeType"] == "prefix"


-def test_workflow_total_size(
+def test_workflow_total_size_and_last_crawl_stats(
    crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
    admin_crawl_cid = ""
@@ -225,8 +225,18 @@
        last_crawl_id = workflow.get("lastCrawlId")
        if last_crawl_id and last_crawl_id in (admin_crawl_id, crawler_crawl_id):
            assert workflow["totalSize"] > 0
+            assert workflow["crawlCount"] > 0
+
+            assert workflow["lastCrawlId"]
+            assert workflow["lastCrawlStartTime"]
+            assert workflow["lastStartedByName"]
+            assert workflow["lastCrawlTime"]
+            assert workflow["lastCrawlState"]
+            assert workflow["lastCrawlSize"] > 0
+
            if last_crawl_id == admin_crawl_id:
                admin_crawl_cid = workflow["id"]
                assert admin_crawl_cid
        else:
            assert workflow["totalSize"] == 0
@@ -237,3 +247,11 @@
    assert r.status_code == 200
    data = r.json()
    assert data["totalSize"] > 0
+    assert data["crawlCount"] > 0
+
+    assert data["lastCrawlId"]
+    assert data["lastCrawlStartTime"]
+    assert data["lastStartedByName"]
+    assert data["lastCrawlTime"]
+    assert data["lastCrawlState"]
+    assert data["lastCrawlSize"] > 0
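Note: a hedged way to spot-check the new fields by hand outside the test suite, using requests; the endpoint path, base URL, and auth header shape are assumptions rather than values taken from this diff (the tests above get them from shared fixtures):

import requests

API = "http://localhost:30870/api"  # assumed local deployment URL
HEADERS = {"Authorization": "Bearer <access-token>"}  # assumed auth scheme

# assumed endpoint shape for fetching a single workflow
r = requests.get(f"{API}/orgs/<org-id>/crawlconfigs/<workflow-id>", headers=HEADERS)
workflow = r.json()
for field in (
    "crawlCount",
    "totalSize",
    "lastCrawlId",
    "lastCrawlStartTime",
    "lastStartedByName",
    "lastCrawlTime",
    "lastCrawlState",
    "lastCrawlSize",
):
    print(field, workflow.get(field))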