* Precompute config crawl stats
* Includes a database migration to move previously dynamically computed crawl stats for workflows into the CrawlConfig model.
* Add crawls.finished descending index
* Add last crawl fields to workflow tests
178 lines · 5.3 KiB · Python
"""
|
|
Browsertrix API Mongo DB initialization
|
|
"""
|
|
import importlib.util
|
|
import os
|
|
import urllib
|
|
from typing import Optional
|
|
|
|
import motor.motor_asyncio
|
|
from pydantic import BaseModel, UUID4
|
|
from pymongo.errors import InvalidName
|
|
|
|
from .migrations import BaseMigration
|
|
|
|
|
|
CURR_DB_VERSION = "0006"


# ============================================================================
def resolve_db_url():
    """get the mongo db url, either from MONGO_DB_URL or
    from separate username, password and host settings"""
    db_url = os.environ.get("MONGO_DB_URL")
    if db_url:
        return db_url

    mongo_user = urllib.parse.quote_plus(os.environ["MONGO_INITDB_ROOT_USERNAME"])
    mongo_pass = urllib.parse.quote_plus(os.environ["MONGO_INITDB_ROOT_PASSWORD"])
    mongo_host = os.environ["MONGO_HOST"]

    return f"mongodb://{mongo_user}:{mongo_pass}@{mongo_host}:27017"

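# Illustrative sketch (not part of the module): resolve_db_url() reads its
# connection settings from the environment. Either a full connection URL is set:
#
#     MONGO_DB_URL=mongodb://admin:secret@mongo-host:27017
#
# or the URL is assembled from the individual variables (values shown here are
# hypothetical):
#
#     MONGO_INITDB_ROOT_USERNAME=admin
#     MONGO_INITDB_ROOT_PASSWORD=secret
#     MONGO_HOST=local-mongo
#
# Username and password are passed through urllib.parse.quote_plus, so
# credentials containing characters such as "@" or ":" are escaped safely.
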
# ============================================================================
def init_db():
    """initialize the mongodb connector"""

    db_url = resolve_db_url()

    client = motor.motor_asyncio.AsyncIOMotorClient(
        db_url,
        uuidRepresentation="standard",
        connectTimeoutMS=120000,
        serverSelectionTimeoutMS=120000,
    )

    mdb = client["browsertrixcloud"]

    return client, mdb

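# Usage sketch (illustrative only): init_db() returns both the Motor client and
# the "browsertrixcloud" database handle. Collections are then accessed by name
# on the database object; the collection name below is an assumption for
# illustration, not something this module defines.
#
#     client, mdb = init_db()
#     crawls = mdb["crawls"]
#     # once an event loop is running, e.g.:
#     # total = await crawls.count_documents({})
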
# ============================================================================
async def update_and_prepare_db(
    # pylint: disable=R0913
    mdb,
    user_manager,
    org_ops,
    crawl_ops,
    crawl_config_ops,
    coll_ops,
    invite_ops,
):
    """Prepare database for application.

    - Run database migrations
    - Recreate indexes
    - Create/update superuser
    - Create/update default org
    """
    print("Database setup started", flush=True)
    if await run_db_migrations(mdb, user_manager):
        await drop_indexes(mdb)
    await create_indexes(org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops)
    await user_manager.create_super_user()
    await org_ops.create_default_org()
    print("Database updated and ready", flush=True)

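# Startup sketch (illustrative only): the ops objects are constructed elsewhere
# in the application and are simply passed through here, so this only shows the
# call order, not how those objects are built.
#
#     client, mdb = init_db()
#     await update_and_prepare_db(
#         mdb, user_manager, org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops
#     )
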
# ============================================================================
async def run_db_migrations(mdb, user_manager):
    """Run database migrations."""

    # if first run, just set version and exit
    if not await user_manager.get_superuser():
        base_migration = BaseMigration(mdb, CURR_DB_VERSION)
        await base_migration.set_db_version()
        print(
            "New DB, no migration needed, set version to: " + CURR_DB_VERSION,
            flush=True,
        )
        return False

    migrations_run = False
    migrations_path = "/app/btrixcloud/migrations"
    module_files = [
        f
        for f in sorted(os.listdir(migrations_path))
        if not os.path.isdir(os.path.join(migrations_path, f))
        and not f.startswith("__")
    ]
    for module_file in module_files:
        module_path = os.path.join(migrations_path, module_file)
        try:
            # use splitext rather than rstrip(".py") so that module names
            # ending in "p" or "y" are not truncated along with the extension
            migration_name = os.path.splitext(os.path.basename(module_file))[0]
            spec = importlib.util.spec_from_file_location(
                f".migrations.{migration_name}", module_path
            )
            assert spec
            migration_module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(migration_module)

            migration = migration_module.Migration(mdb)
            if await migration.run():
                migrations_run = True
        except ImportError as err:
            print(
                f"Error importing Migration class from module {module_file}: {err}",
                flush=True,
            )
    return migrations_run

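# Minimal migration sketch (hypothetical module placed in
# /app/btrixcloud/migrations, with a file name not starting with "__"). The
# loader above only requires that each module expose a Migration class that is
# constructed with the database handle and has an async run() method returning
# True when it actually changed anything; anything beyond that, including
# whether real migrations build on BaseMigration, is not shown in this module.
#
#     class Migration:
#         """Hypothetical migration for illustration only."""
#
#         def __init__(self, mdb):
#             self.mdb = mdb
#
#         async def run(self):
#             # apply the schema/data change; return True if the DB was modified
#             return True
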
# ============================================================================
async def drop_indexes(mdb):
    """Drop all database indexes."""
    print("Dropping database indexes", flush=True)
    collection_names = await mdb.list_collection_names()
    for collection in collection_names:
        try:
            current_coll = mdb[collection]
            await current_coll.drop_indexes()
            print(f"Indexes for collection {collection} dropped")
        except InvalidName:
            continue

# ============================================================================
async def create_indexes(org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops):
    """Create database indexes."""
    print("Creating database indexes", flush=True)
    await org_ops.init_index()
    await crawl_ops.init_index()
    await crawl_config_ops.init_index()
    await coll_ops.init_index()
    await invite_ops.init_index()

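# Sketch of what an individual init_index() might contain (hypothetical; the
# ops classes live in other modules). The commit notes adding a descending
# index on crawls.finished, which with Motor/pymongo would look roughly like
# the following; the attribute holding the collection is an assumption here.
#
#     import pymongo
#
#     async def init_index(self):
#         await self.crawls.create_index([("finished", pymongo.DESCENDING)])
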
# ============================================================================
class BaseMongoModel(BaseModel):
    """Base pydantic model that is also a mongo doc"""

    id: Optional[UUID4]

    @property
    def id_str(self):
        """Return id as str"""
        return str(self.id)

    @classmethod
    def from_dict(cls, data):
        """convert dict from mongo to a class"""
        if not data:
            return None
        data["id"] = data.pop("_id")
        return cls(**data)

    def serialize(self, **opts):
        """convert class to dict"""
        return self.dict(
            exclude_unset=True, exclude_defaults=True, exclude_none=True, **opts
        )

    def to_dict(self, **opts):
        """convert to dict for mongo"""
        res = self.dict(**opts)
        res["_id"] = res.pop("id", "")
        return res
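
# Round-trip sketch (illustrative only; "Example" is a hypothetical subclass):
# BaseMongoModel maps the Mongo "_id" field to "id" on the way in (from_dict)
# and back to "_id" on the way out (to_dict), while serialize() produces a
# trimmed dict suitable for API responses.
#
#     import uuid
#
#     class Example(BaseMongoModel):
#         name: str = ""
#
#     doc = {"_id": uuid.uuid4(), "name": "test"}
#     obj = Example.from_dict(doc)
#     obj.to_dict()    # {"_id": UUID(...), "name": "test"}
#     obj.serialize()  # {"id": UUID(...), "name": "test"} (unset defaults omitted)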