Pre-compute workflow last crawl information (#812)
* Precompute config crawl stats
* Includes a database migration to move previously dynamically computed crawl stats for workflows into the CrawlConfig model.
* Add crawls.finished descending index
* Add last crawl fields to workflow tests
parent 9fcbc3f87e
commit 8281ba723e
@@ -178,6 +178,16 @@ class CrawlConfig(CrawlConfigCore):
 
     rev: int = 0
+
+    crawlCount: Optional[int] = 0
+    totalSize: Optional[int] = 0
+
+    lastCrawlId: Optional[str]
+    lastCrawlStartTime: Optional[datetime]
+    lastStartedBy: Optional[UUID4]
+    lastCrawlTime: Optional[datetime]
+    lastCrawlState: Optional[str]
+    lastCrawlSize: Optional[int]
 
     def get_raw_config(self):
         """serialize config for browsertrix-crawler"""
         return self.config.dict(exclude_unset=True, exclude_none=True)
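With these fields on the CrawlConfig model, a workflow document in the crawl_configs collection carries its last-crawl summary directly. A minimal read-side sketch (the motor client setup and the "btrix" database name are assumptions, not part of this commit):

    # Sketch: read the precomputed stats straight off a workflow document.
    import asyncio
    from motor.motor_asyncio import AsyncIOMotorClient

    async def print_workflow_stats(cid):
        mdb = AsyncIOMotorClient("mongodb://localhost:27017")["btrix"]  # placeholder URI/db
        config = await mdb["crawl_configs"].find_one({"_id": cid})
        if config:
            # Filled in by update_config_crawl_stats() further down in this commit,
            # so no $lookup against the crawls collection is needed at read time.
            print(config.get("crawlCount"), config.get("totalSize"))
            print(config.get("lastCrawlId"), config.get("lastCrawlState"))
            print(config.get("lastCrawlStartTime"), config.get("lastCrawlTime"))

    # asyncio.run(print_workflow_stats(some_workflow_id))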
@@ -200,15 +210,6 @@ class CrawlConfigOut(CrawlConfig):
 
     firstSeed: Optional[str]
 
-    totalSize: Optional[int] = 0
-
-    crawlCount: Optional[int] = 0
-    lastCrawlId: Optional[str]
-    lastCrawlStartTime: Optional[datetime]
-    lastCrawlTime: Optional[datetime]
-    lastCrawlState: Optional[str]
-    lastCrawlSize: Optional[int]
-
 
 # ============================================================================
 class CrawlConfigIdNameOut(BaseMongoModel):
@@ -241,6 +242,7 @@ class CrawlConfigOps:
 
     def __init__(self, dbclient, mdb, user_manager, org_ops, crawl_manager, profiles):
         self.dbclient = dbclient
+        self.crawls = mdb["crawls"]
         self.crawl_configs = mdb["crawl_configs"]
         self.config_revs = mdb["configs_revs"]
         self.user_manager = user_manager
@@ -264,7 +266,7 @@ class CrawlConfigOps:
         self.crawl_ops = ops
 
     async def init_index(self):
-        """init index for crawls db"""
+        """init index for crawlconfigs db collection"""
         await self.crawl_configs.create_index(
             [("oid", pymongo.HASHED), ("inactive", pymongo.ASCENDING)]
         )
@@ -509,84 +511,6 @@ class CrawlConfigOps:
             # Set firstSeed
             {"$set": {"firstSeed": "$firstSeedObject.url"}},
             {"$unset": ["firstSeedObject"]},
-            {
-                "$lookup": {
-                    "from": "crawls",
-                    "localField": "_id",
-                    "foreignField": "cid",
-                    "as": "configCrawls",
-                },
-            },
-            # Filter workflow crawls on finished and active
-            {
-                "$set": {
-                    "finishedCrawls": {
-                        "$filter": {
-                            "input": "$configCrawls",
-                            "as": "filterCrawls",
-                            "cond": {
-                                "$and": [
-                                    {"$ne": ["$$filterCrawls.finished", None]},
-                                    {"$ne": ["$$filterCrawls.inactive", True]},
-                                ]
-                            },
-                        }
-                    }
-                }
-            },
-            # Set crawl count to number of finished crawls
-            {"$set": {"crawlCount": {"$size": "$finishedCrawls"}}},
-            # Sort finished crawls by finished time descending to get latest
-            {
-                "$set": {
-                    "sortedCrawls": {
-                        "$sortArray": {
-                            "input": "$finishedCrawls",
-                            "sortBy": {"finished": -1},
-                        }
-                    }
-                }
-            },
-            {"$unset": ["finishedCrawls"]},
-            {"$set": {"lastCrawl": {"$arrayElemAt": ["$sortedCrawls", 0]}}},
-            {"$set": {"lastCrawlId": "$lastCrawl._id"}},
-            {"$set": {"lastCrawlStartTime": "$lastCrawl.started"}},
-            {"$set": {"lastCrawlTime": "$lastCrawl.finished"}},
-            {"$set": {"lastCrawlState": "$lastCrawl.state"}},
-            # Get userid of last started crawl
-            {"$set": {"lastStartedBy": "$lastCrawl.userid"}},
-            {"$set": {"lastCrawlSize": {"$sum": "$lastCrawl.files.size"}}},
-            {
-                "$lookup": {
-                    "from": "users",
-                    "localField": "lastStartedBy",
-                    "foreignField": "id",
-                    "as": "lastStartedByName",
-                },
-            },
-            {
-                "$set": {
-                    "lastStartedByName": {
-                        "$arrayElemAt": ["$lastStartedByName.name", 0]
-                    }
-                }
-            },
-            {
-                "$set": {
-                    "totalSize": {
-                        "$sum": {
-                            "$map": {
-                                "input": "$sortedCrawls.files",
-                                "as": "crawlFile",
-                                "in": {"$arrayElemAt": ["$$crawlFile.size", 0]},
-                            }
-                        }
-                    }
-                }
-            },
-            # unset
-            {"$unset": ["lastCrawl"]},
-            {"$unset": ["sortedCrawls"]},
         ]
 
         if first_seed:
@@ -611,6 +535,19 @@ class CrawlConfigOps:
                 },
             },
             {"$set": {"createdByName": {"$arrayElemAt": ["$userName.name", 0]}}},
+            {
+                "$lookup": {
+                    "from": "users",
+                    "localField": "lastStartedBy",
+                    "foreignField": "id",
+                    "as": "startedName",
+                },
+            },
+            {
+                "$set": {
+                    "lastStartedByName": {"$arrayElemAt": ["$startedName.name", 0]}
+                }
+            },
             {
                 "$lookup": {
                     "from": "users",
@@ -648,22 +585,12 @@ class CrawlConfigOps:
         except (IndexError, ValueError):
             total = 0
 
-        # crawls = await self.crawl_manager.list_running_crawls(oid=org.id)
-        crawls, _ = await self.crawl_ops.list_crawls(
-            org=org,
-            running_only=True,
-            # Set high so that when we lower default we still get all running crawls
-            page_size=1_000,
-        )
-        running = {}
-        for crawl in crawls:
-            running[crawl.cid] = crawl
-
         configs = []
         for res in items:
             config = CrawlConfigOut.from_dict(res)
             # pylint: disable=invalid-name
-            self._add_curr_crawl_stats(config, running.get(config.id))
+            if not config.inactive:
+                self._add_curr_crawl_stats(config, await self.get_running_crawl(config))
             configs.append(config)
 
         return configs, total
@@ -693,20 +620,9 @@ class CrawlConfigOps:
 
         return None
 
-    async def _annotate_with_crawl_stats(self, crawlconfig: CrawlConfigOut):
-        """Annotate crawlconfig with information about associated crawls"""
-        crawl_stats = await self.crawl_ops.get_latest_crawl_and_count_by_config(
-            cid=crawlconfig.id
-        )
-        crawlconfig.crawlCount = crawl_stats["crawl_count"]
-        crawlconfig.totalSize = crawl_stats["total_size"]
-        crawlconfig.lastCrawlId = crawl_stats["last_crawl_id"]
-        crawlconfig.lastCrawlStartTime = crawl_stats["last_crawl_started"]
-        crawlconfig.lastCrawlTime = crawl_stats["last_crawl_finished"]
-        crawlconfig.lastStartedByName = crawl_stats["last_started_by"]
-        crawlconfig.lastCrawlState = crawl_stats["last_crawl_state"]
-        crawlconfig.lastCrawlSize = crawl_stats["last_crawl_size"]
-        return crawlconfig
+    async def update_crawl_stats(self, cid: uuid.UUID):
+        """Update crawl count, total size, and last crawl information for config."""
+        await update_config_crawl_stats(self.crawl_configs, self.crawls, cid)
 
     def _add_curr_crawl_stats(self, crawlconfig, crawl):
         """Add stats from current running crawl, if any"""
@@ -745,13 +661,17 @@ class CrawlConfigOps:
         if modified_user:
             crawlconfig.modifiedByName = modified_user.name
 
+        if crawlconfig.lastStartedBy:
+            last_started_user = await self.user_manager.get(crawlconfig.lastStartedBy)
+            # pylint: disable=invalid-name
+            if last_started_user:
+                crawlconfig.lastStartedByName = last_started_user.name
+
         if crawlconfig.profileid:
             crawlconfig.profileName = await self.profiles.get_profile_name(
                 crawlconfig.profileid, org
             )
 
-        crawlconfig = await self._annotate_with_crawl_stats(crawlconfig)
-
         return crawlconfig
 
     async def get_crawl_config(
@@ -957,6 +877,58 @@ async def inc_crawl_count(crawl_configs, cid: uuid.UUID):
     )
 
 
+# ============================================================================
+# pylint: disable=too-many-locals
+async def update_config_crawl_stats(crawl_configs, crawls, cid: uuid.UUID):
+    """re-calculate and update crawl statistics for config"""
+    update_query = {
+        "crawlCount": 0,
+        "totalSize": 0,
+        "lastCrawlId": None,
+        "lastCrawlStartTime": None,
+        "lastStartedBy": None,
+        "lastCrawlTime": None,
+        "lastCrawlState": None,
+        "lastCrawlSize": None,
+    }
+
+    match_query = {"cid": cid, "finished": {"$ne": None}, "inactive": {"$ne": True}}
+    cursor = crawls.find(match_query).sort("finished", pymongo.DESCENDING)
+    results = await cursor.to_list(length=10_000)
+    if results:
+        update_query["crawlCount"] = len(results)
+
+        last_crawl = results[0]
+        update_query["lastCrawlId"] = str(last_crawl.get("_id"))
+        update_query["lastCrawlStartTime"] = last_crawl.get("started")
+        update_query["lastStartedBy"] = last_crawl.get("userid")
+        update_query["lastCrawlTime"] = last_crawl.get("finished")
+        update_query["lastCrawlState"] = last_crawl.get("state")
+        update_query["lastCrawlSize"] = sum(
+            file_.get("size", 0) for file_ in last_crawl.get("files", [])
+        )
+
+        total_size = 0
+        for res in results:
+            files = res.get("files", [])
+            for file in files:
+                total_size += file.get("size", 0)
+        update_query["totalSize"] = total_size
+
+    result = await crawl_configs.find_one_and_update(
+        {"_id": cid, "inactive": {"$ne": True}},
+        {"$set": update_query},
+        return_document=pymongo.ReturnDocument.AFTER,
+    )
+
+    if not result:
+        raise HTTPException(
+            status_code=404, detail=f"Crawl Config '{cid}' not found to update"
+        )
+
+    return result
+
+
 # ============================================================================
 # pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
 def init_crawl_config_api(
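update_config_crawl_stats() above is the single write path for the new fields. A hedged usage sketch for recomputing one workflow out of band (connection string and database name are placeholders):

    import uuid
    from motor.motor_asyncio import AsyncIOMotorClient
    from btrixcloud.crawlconfigs import update_config_crawl_stats

    async def recompute_one(cid: uuid.UUID):
        mdb = AsyncIOMotorClient("mongodb://localhost:27017")["btrix"]  # placeholder
        # Recomputes crawlCount, totalSize and the lastCrawl* fields from the
        # crawls collection and writes them onto the matching crawl_configs doc,
        # returning the updated document (ReturnDocument.AFTER).
        updated = await update_config_crawl_stats(mdb["crawl_configs"], mdb["crawls"], cid)
        return updated["crawlCount"], updated["lastCrawlState"]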
@@ -196,6 +196,10 @@ class CrawlOps:
 
         self.presign_duration = int(os.environ.get("PRESIGN_DURATION_SECONDS", 3600))
 
+    async def init_index(self):
+        """init index for crawls db collection"""
+        await self.crawls.create_index([("finished", pymongo.DESCENDING)])
+
     async def list_crawls(
         self,
         org: Optional[Organization] = None,
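The new descending index on crawls.finished matches the sort used by update_config_crawl_stats(), which reads finished crawls newest-first. A small pymongo sketch of the query shape the index serves (URI and database name are placeholders):

    import pymongo

    def latest_finished_crawl(cid):
        client = pymongo.MongoClient("mongodb://localhost:27017")  # placeholder
        crawls = client["btrix"]["crawls"]
        # Same index definition as CrawlOps.init_index() above.
        crawls.create_index([("finished", pymongo.DESCENDING)])
        # Newest finished, non-inactive crawl for the workflow, served by the index.
        cursor = (
            crawls.find({"cid": cid, "finished": {"$ne": None}, "inactive": {"$ne": True}})
            .sort("finished", pymongo.DESCENDING)
            .limit(1)
        )
        return next(cursor, None)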
@@ -380,45 +384,6 @@ class CrawlOps:
 
         return await self._resolve_crawl_refs(crawl, org)
 
-    async def get_latest_crawl_and_count_by_config(self, cid: str):
-        """Get crawl statistics for a crawl_config with id cid."""
-        stats = {
-            "crawl_count": 0,
-            "total_size": 0,
-            "last_crawl_id": None,
-            "last_crawl_started": None,
-            "last_crawl_finished": None,
-            "last_crawl_state": None,
-            "last_started_by": None,
-            "last_crawl_size": 0,
-        }
-
-        match_query = {"cid": cid, "finished": {"$ne": None}, "inactive": {"$ne": True}}
-        cursor = self.crawls.find(match_query).sort("finished", pymongo.DESCENDING)
-        results = await cursor.to_list(length=1000)
-        if results:
-            stats["crawl_count"] = len(results)
-
-            last_crawl = Crawl.from_dict(results[0])
-            stats["last_crawl_id"] = str(last_crawl.id)
-            stats["last_crawl_started"] = last_crawl.started
-            stats["last_crawl_finished"] = last_crawl.finished
-            stats["last_crawl_state"] = last_crawl.state
-            stats["last_crawl_size"] = sum(file_.size for file_ in last_crawl.files)
-
-            user = await self.user_manager.get(last_crawl.userid)
-            if user:
-                stats["last_started_by"] = user.name
-
-            total_size = 0
-            for res in results:
-                files = res["files"]
-                for file in files:
-                    total_size += file["size"]
-            stats["total_size"] = total_size
-
-        return stats
-
     async def _resolve_crawl_refs(
         self,
         crawl: Union[CrawlOut, ListCrawlOut],
@@ -527,13 +492,22 @@ class CrawlOps:
 
     async def delete_crawls(self, org: Organization, delete_list: DeleteCrawlList):
        """Delete a list of crawls by id for given org"""
+        cids_to_update = set()
+
        for crawl_id in delete_list.crawl_ids:
            await self._delete_crawl_files(org, crawl_id)
+            await self.remove_crawl_from_collections(org, crawl_id)
+
+            crawl = await self.get_crawl_raw(crawl_id, org)
+            cids_to_update.add(crawl["cid"])
 
        res = await self.crawls.delete_many(
            {"_id": {"$in": delete_list.crawl_ids}, "oid": org.id}
        )
 
+        for cid in cids_to_update:
+            await self.crawl_configs.update_crawl_stats(cid)
+
        return res.deleted_count
 
     async def _delete_crawl_files(self, org: Organization, crawl_id: str):
@@ -837,7 +811,7 @@ class CrawlOps:
 
         return resp
 
-    async def remove_crawl_from_collections(self, oid: uuid.UUID, crawl_id: str):
+    async def remove_crawl_from_collections(self, org: Organization, crawl_id: str):
         """Remove crawl with given crawl_id from all collections it belongs to"""
         collections = [
             coll["name"]
@@ -845,7 +819,7 @@ class CrawlOps:
         ]
         for collection_name in collections:
             await self.collections.find_one_and_update(
-                {"name": collection_name, "oid": oid},
+                {"name": collection_name, "oid": org.id},
                 {"$pull": {"crawlIds": crawl_id}},
             )
 
@@ -884,7 +858,11 @@ async def add_new_crawl(
 # ============================================================================
 async def update_crawl(crawls, crawl_id, **kwargs):
     """update crawl state in db"""
-    await crawls.find_one_and_update({"_id": crawl_id}, {"$set": kwargs})
+    return await crawls.find_one_and_update(
+        {"_id": crawl_id},
+        {"$set": kwargs},
+        return_document=pymongo.ReturnDocument.AFTER,
+    )
 
 
 # ============================================================================
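The return_document=pymongo.ReturnDocument.AFTER argument is what lets callers read fields such as cid from the value update_crawl() now returns; with pymongo's default (ReturnDocument.BEFORE) the pre-update document comes back instead. A minimal sketch, assuming a motor crawls collection is already in hand and an illustrative state value:

    import pymongo

    async def mark_complete(crawls, crawl_id):
        # Returns the document as it looks *after* the $set is applied.
        updated = await crawls.find_one_and_update(
            {"_id": crawl_id},
            {"$set": {"state": "complete"}},
            return_document=pymongo.ReturnDocument.AFTER,
        )
        return updated["cid"] if updated else None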
@@ -1056,8 +1034,6 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
                 status_code=400, detail=f"Error Stopping Crawl: {exc}"
             )
 
-        await ops.remove_crawl_from_collections(crawl.oid, crawl.id)
-
         res = await ops.delete_crawls(org, delete_list)
 
         return {"deleted": res}
@@ -13,7 +13,7 @@ from pymongo.errors import InvalidName
 from .migrations import BaseMigration
 
 
-CURR_DB_VERSION = "0005"
+CURR_DB_VERSION = "0006"
 
 
 # ============================================================================
@@ -55,6 +55,7 @@ async def update_and_prepare_db(
     mdb,
     user_manager,
     org_ops,
+    crawl_ops,
     crawl_config_ops,
     coll_ops,
     invite_ops,
@@ -70,7 +71,7 @@ async def update_and_prepare_db(
     print("Database setup started", flush=True)
     if await run_db_migrations(mdb, user_manager):
         await drop_indexes(mdb)
-    await create_indexes(org_ops, crawl_config_ops, coll_ops, invite_ops)
+    await create_indexes(org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops)
     await user_manager.create_super_user()
     await org_ops.create_default_org()
     print("Database updated and ready", flush=True)
@@ -134,10 +135,11 @@ async def drop_indexes(mdb):
 
 
 # ============================================================================
-async def create_indexes(org_ops, crawl_config_ops, coll_ops, invite_ops):
+async def create_indexes(org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops):
     """Create database indexes."""
     print("Creating database indexes", flush=True)
     await org_ops.init_index()
+    await crawl_ops.init_index()
     await crawl_config_ops.init_index()
     await coll_ops.init_index()
     await invite_ops.init_index()
@@ -114,7 +114,7 @@ def main():
     if run_once_lock("btrix-init-db"):
         asyncio.create_task(
             update_and_prepare_db(
-                mdb, user_manager, org_ops, crawl_config_ops, coll_ops, invites
+                mdb, user_manager, org_ops, crawls, crawl_config_ops, coll_ops, invites
             )
         )
 
@@ -0,0 +1,31 @@
+"""
+Migration 0006 - Precomputing workflow crawl stats
+"""
+from btrixcloud.crawlconfigs import update_config_crawl_stats
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0006"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    def __init__(self, mdb, migration_version=MIGRATION_VERSION):
+        super().__init__(mdb, migration_version)
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Add data on workflow crawl statistics that was previously dynamically
+        computed when needed to the database.
+        """
+        crawl_configs = self.mdb["crawl_configs"]
+        crawls = self.mdb["crawls"]
+
+        configs = [res async for res in crawl_configs.find({})]
+        if not configs:
+            return
+
+        for config in configs:
+            await update_config_crawl_stats(crawl_configs, crawls, config["_id"])
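Migration 0006 is the backfill loop above, run once at startup by run_db_migrations(). For reference, a hand-run equivalent (connection string and database name are placeholders):

    import asyncio
    from motor.motor_asyncio import AsyncIOMotorClient
    from btrixcloud.crawlconfigs import update_config_crawl_stats

    async def backfill_all_workflows():
        mdb = AsyncIOMotorClient("mongodb://localhost:27017")["btrix"]  # placeholder
        crawl_configs = mdb["crawl_configs"]
        crawls = mdb["crawls"]
        # Same loop as Migration.migrate_up(): recompute stats for every workflow.
        async for config in crawl_configs.find({}):
            await update_config_crawl_stats(crawl_configs, crawls, config["_id"])

    # asyncio.run(backfill_all_workflows())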
@@ -19,6 +19,7 @@ from .k8sapi import K8sAPI
 
 from .db import init_db
 from .orgs import inc_org_stats
+from .crawlconfigs import update_config_crawl_stats
 from .crawls import (
     CrawlFile,
     CrawlCompleteIn,
@@ -97,6 +98,7 @@ class BtrixOperator(K8sAPI):
 
         _, mdb = init_db()
         self.crawls = mdb["crawls"]
+        self.crawl_configs = mdb["crawl_configs"]
         self.orgs = mdb["organizations"]
 
         self.done_key = "crawls-done"
@@ -446,7 +448,10 @@ class BtrixOperator(K8sAPI):
         if stats:
             kwargs["stats"] = stats
 
-        await update_crawl(self.crawls, crawl_id, **kwargs)
+        crawl = await update_crawl(self.crawls, crawl_id, **kwargs)
+        crawl_cid = crawl.get("cid")
+
+        await update_config_crawl_stats(self.crawl_configs, self.crawls, crawl_cid)
 
         if redis:
             await self.add_crawl_errors_to_db(redis, crawl_id)
@@ -208,7 +208,7 @@ def test_verify_revs_history(crawler_auth_headers, default_org_id):
     assert sorted_data[0]["config"]["scopeType"] == "prefix"
 
 
-def test_workflow_total_size(
+def test_workflow_total_size_and_last_crawl_stats(
     crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
 ):
     admin_crawl_cid = ""
@@ -225,8 +225,18 @@ def test_workflow_total_size_and_last_crawl_stats(
         last_crawl_id = workflow.get("lastCrawlId")
         if last_crawl_id and last_crawl_id in (admin_crawl_id, crawler_crawl_id):
             assert workflow["totalSize"] > 0
+            assert workflow["crawlCount"] > 0
+
+            assert workflow["lastCrawlId"]
+            assert workflow["lastCrawlStartTime"]
+            assert workflow["lastStartedByName"]
+            assert workflow["lastCrawlTime"]
+            assert workflow["lastCrawlState"]
+            assert workflow["lastCrawlSize"] > 0
+
             if last_crawl_id == admin_crawl_id:
                 admin_crawl_cid = workflow["id"]
+                assert admin_crawl_cid
         else:
             assert workflow["totalSize"] == 0
 
@@ -237,3 +247,11 @@ def test_workflow_total_size_and_last_crawl_stats(
     assert r.status_code == 200
     data = r.json()
     assert data["totalSize"] > 0
+    assert data["crawlCount"] > 0
+
+    assert data["lastCrawlId"]
+    assert data["lastCrawlStartTime"]
+    assert data["lastStartedByName"]
+    assert data["lastCrawlTime"]
+    assert data["lastCrawlState"]
+    assert data["lastCrawlSize"] > 0