CrawlConfig migration and crawl stats query optimization (#633)

* Drop crawl stats fields from CrawlConfig and add migration

* Remove migrate_down from BaseMigration

* Get crawl stats from optimized mongo query
This commit is contained in:
Tessa Walsh 2023-02-24 21:01:15 -05:00 committed by GitHub
parent 1dea7ecdf9
commit e2f359c352
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 136 additions and 83 deletions

View File

@ -139,15 +139,6 @@ class CrawlConfig(BaseMongoModel):
crawlAttemptCount: Optional[int] = 0
# These fields would ideally be in CrawlConfigOut, but are being
# kept here to prevent the need for a migration. Eventually, we
# may want to add a migration and move them, as these values are
# now generated dynamically in API endpoints as needed.
crawlCount: Optional[int] = 0
lastCrawlId: Optional[str]
lastCrawlTime: Optional[datetime]
lastCrawlState: Optional[str]
newId: Optional[UUID4]
oldId: Optional[UUID4]
inactive: Optional[bool] = False
@ -165,6 +156,11 @@ class CrawlConfigOut(CrawlConfig):
profileName: Optional[str]
userName: Optional[str]
crawlCount: Optional[int] = 0
lastCrawlId: Optional[str]
lastCrawlTime: Optional[datetime]
lastCrawlState: Optional[str]
# ============================================================================
class CrawlConfigIdNameOut(BaseMongoModel):
@ -436,21 +432,13 @@ class CrawlConfigOps:
async def _annotate_with_crawl_stats(self, crawlconfig: CrawlConfigOut):
    """Annotate crawlconfig with information about associated crawls.

    Sets crawlCount, lastCrawlId, lastCrawlTime and lastCrawlState on the
    given object from the stats dict returned by
    crawl_ops.get_latest_crawl_and_count_by_config, then returns that same
    object.
    """
    # One stats lookup replaces listing every crawl for the config and
    # sorting the finished ones in Python; the stale listing code is dropped
    # so the count and "last crawl" fields always come from the same query.
    crawl_stats = await self.crawl_ops.get_latest_crawl_and_count_by_config(
        cid=crawlconfig.id
    )
    crawlconfig.crawlCount = crawl_stats["crawl_count"]
    crawlconfig.lastCrawlId = crawl_stats["last_crawl_id"]
    crawlconfig.lastCrawlTime = crawl_stats["last_crawl_finished"]
    crawlconfig.lastCrawlState = crawl_stats["last_crawl_state"]
    return crawlconfig
async def get_crawl_config_out(self, cid: uuid.UUID, org: Organization):

View File

@ -291,6 +291,28 @@ class CrawlOps:
return await self._resolve_crawl_refs(crawl, org)
async def get_latest_crawl_and_count_by_config(self, cid: str):
    """Get crawl statistics for a crawl_config with id cid.

    Only crawls that have a non-null "finished" value and are not marked
    inactive are considered.

    Returns a dict with the keys "crawl_count", "last_crawl_id",
    "last_crawl_finished" and "last_crawl_state". When no crawl matches,
    the count is 0 and the other values are None.
    """
    stats = {
        "crawl_count": 0,
        "last_crawl_id": None,
        "last_crawl_finished": None,
        "last_crawl_state": None,
    }

    match_query = {"cid": cid, "finished": {"$ne": None}, "inactive": {"$ne": True}}

    # Fetch only the most recently finished crawl instead of materializing
    # up to 1000 full documents just to look at the first one.
    last_crawl_doc = await self.crawls.find_one(
        match_query, sort=[("finished", pymongo.DESCENDING)]
    )
    if not last_crawl_doc:
        return stats

    # Count on the server so the total is not silently capped at the
    # to_list() length limit.
    stats["crawl_count"] = await self.crawls.count_documents(match_query)

    last_crawl = Crawl.from_dict(last_crawl_doc)
    stats["last_crawl_id"] = str(last_crawl.id)
    stats["last_crawl_finished"] = last_crawl.finished
    stats["last_crawl_state"] = last_crawl.state

    return stats
async def _resolve_crawl_refs(
self, crawl: Union[CrawlOut, ListCrawlOut], org: Optional[Organization]
):

View File

@ -0,0 +1,68 @@
"""
BaseMigration class to subclass in each migration module
"""
from pymongo.errors import OperationFailure
class BaseMigration:
    """Shared version bookkeeping and run logic for migration modules.

    Subclasses provide ``migrate_up``; everything else is inherited.
    """

    def __init__(self, mdb, migration_version="0001"):
        self.migration_version = migration_version
        self.mdb = mdb

    async def get_db_version(self):
        """Return the version stored in the "version" collection, or None.

        None is returned both when no record exists and when the record has
        no "version" key.
        """
        record = await self.mdb["version"].find_one()
        if not record:
            return None
        return record.get("version")

    async def set_db_version(self):
        """Record migration_version as the database version."""
        update = {"$set": {"version": self.migration_version}}
        # upsert=True creates the version record if none exists yet.
        await self.mdb["version"].find_one_and_update({}, update, upsert=True)

    async def migrate_up_needed(self):
        """Report whether this migration still has to be applied."""
        current = await self.get_db_version()
        print(f"Current database version before migration: {current}")
        print(f"Migration available to apply: {self.migration_version}")
        # A missing version means the database predates migrations, so the
        # migration is needed; otherwise compare against our own version.
        return bool(not current or current < self.migration_version)

    async def migrate_up(self):
        """Apply the migration; concrete migrations must override this."""
        raise NotImplementedError(
            "Not implemented in base class - implement in subclass"
        )

    async def run(self):
        """Apply the migration if needed.

        Returns True only when the migration ran and the new version was
        stored; False when it was skipped or failed with OperationFailure.
        """
        if not await self.migrate_up_needed():
            print("No migration to apply - skipping", flush=True)
            return False

        print("Performing migration up", flush=True)
        try:
            await self.migrate_up()
            await self.set_db_version()
        except OperationFailure as err:
            print(f"Error running migration {self.migration_version}: {err}")
            return False

        print(f"Database successfully migrated to {self.migration_version}", flush=True)
        return True

View File

@ -5,10 +5,14 @@ import os
from pymongo.errors import OperationFailure
from btrixcloud.migrations import BaseMigration
from btrixcloud.k8s.k8sapi import K8sAPI
class Migration:
MIGRATION_VERSION = "0001"
class Migration(BaseMigration):
"""Migration class."""
COLLECTIONS_AID_TO_OID = [
@ -19,42 +23,8 @@ class Migration:
"profiles",
]
MIGRATION_VERSION = "0001"
def __init__(self, mdb):
# Keep the database handle; the other methods index it by collection name.
self.mdb = mdb
async def get_db_version(self):
"""Get current db version from database.

Returns the "version" value of the record returned by find_one() on the
"version" collection, or None when there is no record or the record has
no "version" key.
"""
db_version = None
version_collection = self.mdb["version"]
version_record = await version_collection.find_one()
if not version_record:
return db_version
try:
db_version = version_record["version"]
except KeyError:
# Record exists but carries no version field: report None.
pass
return db_version
async def set_db_version(self):
"""Set db version to MIGRATION_VERSION."""
version_collection = self.mdb["version"]
# Empty filter plus upsert=True: update a matching record, or insert one
# if the collection has none.
await version_collection.find_one_and_update(
{}, {"$set": {"version": self.MIGRATION_VERSION}}, upsert=True
)
async def migrate_up_needed(self):
"""Verify migration up is needed and return boolean indicator.

Returns True when no version is stored or the stored version compares
lower than MIGRATION_VERSION, otherwise False.
"""
db_version = await self.get_db_version()
print(f"Current database version before migration: {db_version}")
print(f"Migration available to apply: {self.MIGRATION_VERSION}")
# Databases from prior to migrations will not have a version set.
if not db_version:
return True
# NOTE(review): presumably both values are zero-padded strings such as
# "0001", making this a lexicographic comparison - confirm.
if db_version < self.MIGRATION_VERSION:
return True
return False
def __init__(self, mdb, migration_version=MIGRATION_VERSION):
# Delegate to BaseMigration, defaulting to this module's MIGRATION_VERSION.
super().__init__(mdb, migration_version)
async def migrate_up(self):
"""Perform migration up."""
@ -98,25 +68,3 @@ class Migration:
await k8s_api_instance.core_api.patch_namespaced_config_map(
name=item_name, namespace=crawler_namespace, body=item
)
def migrate_down(self):
"""Perform migration down.

Always raises NotImplementedError: downward migrations are not implemented.
"""
raise NotImplementedError("Downward migrations not yet added")
async def run(self):
"""Run migrations.

Returns True only when migrate_up() and set_db_version() both completed;
returns False when no migration was needed or an OperationFailure was
raised while migrating.
"""
if await self.migrate_up_needed():
print("Performing migration up", flush=True)
try:
await self.migrate_up()
# The version is recorded only after migrate_up() finished without raising.
await self.set_db_version()
except OperationFailure as err:
print(f"Error running migration {self.MIGRATION_VERSION}: {err}")
return False
else:
print("No migration to apply - skipping", flush=True)
return False
print(f"Database successfully migrated to {self.MIGRATION_VERSION}", flush=True)
return True

View File

@ -0,0 +1,27 @@
"""
Migration 0002 - Dropping CrawlConfig crawl stats
"""
from btrixcloud.migrations import BaseMigration
MIGRATION_VERSION = "0002"
class Migration(BaseMigration):
    """Migration 0002: drop stored crawl statistics from crawl configs."""

    def __init__(self, mdb, migration_version=MIGRATION_VERSION):
        super().__init__(mdb, migration_version)

    async def migrate_up(self):
        """Perform migration up.

        Drop the crawlCount, lastCrawlId, lastCrawlTime and lastCrawlState
        fields from every document in the crawl_configs collection, as
        these values are now generated dynamically as needed in API
        endpoints.
        """
        crawl_configs = self.mdb["crawl_configs"]
        # A single $unset can remove several fields, so one pass over the
        # collection replaces four separate full-collection updates.
        await crawl_configs.update_many(
            {},
            {
                "$unset": {
                    "crawlCount": 1,
                    "lastCrawlId": 1,
                    "lastCrawlTime": 1,
                    "lastCrawlState": 1,
                }
            },
        )