browsertrix/backend/btrixcloud/main.py
Ilya Kreymer e13c3bfb48
move db migrations to initContainers (#2449)
- avoids gunicorn worker timeouts for long-running migrations; also fixes #2439
- add main_migrations as an entrypoint that only runs db migrations, using the
existing init_ops() call
- run the 'migrations' container first, with the same resources as 'app' and 'op'
- additional typing for db initialization
- clean up unused code for running only once and waiting for the db to be ready
- fixes #2447
2025-03-03 13:13:15 -08:00
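
For context, the migrations-only entrypoint described above might look roughly like this. This is a hedged sketch, not the actual main_migrations module: only the module name and the reuse of the existing init_ops() call come from the commit message; the helper names and structure here are assumptions.

    # main_migrations.py -- illustrative sketch of an initContainer entrypoint
    import asyncio
    import sys


    async def run_db_migrations() -> bool:
        """stand-in for the real logic, which initializes the db and runs
        migrations via the existing init_ops() call (hypothetical helper)"""
        return True


    def main() -> None:
        # exit non-zero so the initContainer, and hence the pod start, reports failure
        if not asyncio.run(run_db_migrations()):
            sys.exit(1)


    if __name__ == "__main__":
        main()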


"""
main file for browsertrix-api system
supports docker and kubernetes based deployments of multiple browsertrix-crawlers
"""
import os
import asyncio
import sys
from typing import List, Optional
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from fastapi.routing import APIRouter
from fastapi.openapi.utils import get_openapi
from fastapi.openapi.docs import get_swagger_ui_html, get_redoc_html
from pydantic import BaseModel
from .db import init_db, await_db_and_migrations
from .emailsender import EmailSender
from .invites import init_invites
from .auth import JWT_TOKEN_LIFETIME
from .users import init_users_api, init_user_manager
from .orgs import init_orgs_api
from .profiles import init_profiles_api
from .storages import init_storages_api
from .uploads import init_uploads_api
from .crawlconfigs import init_crawl_config_api
from .colls import init_collections_api
from .crawls import init_crawls_api
from .basecrawls import init_base_crawls_api
from .webhooks import init_event_webhooks_api
from .background_jobs import init_background_jobs_api
from .pages import init_pages_api
from .subs import init_subs_api
from .crawlmanager import CrawlManager
from .utils import register_exit_handler, is_bool
from .version import __version__
API_PREFIX = "/api"
OPENAPI_URL = API_PREFIX + "/openapi.json"
app_root = FastAPI(docs_url=None, redoc_url=None, openapi_url=OPENAPI_URL)
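# set to True by await_db_and_migrations() once the db is reachable
# and the initContainer-run migrations have completed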
db_inited = {"inited": False}
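# ordered OpenAPI tag names, passed to get_openapi() in make_schema() below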
tags = [
"crawlconfigs",
"crawls",
"settings",
"auth",
"users",
"organizations",
"profiles",
"uploads",
"all-crawls",
"qa",
"pages",
"collections",
"webhooks",
"jobs",
"invites",
"subscriptions",
]
# ============================================================================
def make_schema():
"""make custom openapi schema"""
schema = get_openapi(
title="Browsertrix",
description="""\
The Browsertrix API provides access to all aspects of the Browsertrix app.
See [https://docs.browsertrix.com/](https://docs.browsertrix.com/) for more info on deploying Browsertrix\
""",
summary="Browsertrix Crawling System API",
version=__version__,
terms_of_service="https://webrecorder.net/legal/browsertrix-terms-and-conditions/",
contact={
"name": "Browsertrix",
"url": "https://webrecorder.net/browsertrix",
"email": "info@webrecorder.net",
},
license_info={
"name": "AGPL v3",
"url": "https://www.gnu.org/licenses/agpl-3.0.en.html",
},
routes=app_root.routes,
webhooks=app_root.webhooks.routes,
tags=[{"name": tag} for tag in tags],
)
schema["info"]["x-logo"] = {"url": "/docs-logo.svg"}
return schema
# ============================================================================
class SettingsResponse(BaseModel):
"""/api/settings response model"""
registrationEnabled: bool
jwtTokenLifetime: int
defaultBehaviorTimeSeconds: int
defaultPageLoadTimeSeconds: int
maxPagesPerCrawl: int
numBrowsers: int
maxScale: int
billingEnabled: bool
signUpUrl: str = ""
salesEmail: str = ""
supportEmail: str = ""
localesEnabled: Optional[List[str]]
# ============================================================================
# pylint: disable=too-many-locals, duplicate-code
def main() -> None:
"""init browsertrix api"""
app = APIRouter()
email = EmailSender()
crawl_manager = None
dbclient, mdb = init_db()
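    # runtime settings, sourced from env vars and served via GET /api/settings below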
settings = SettingsResponse(
registrationEnabled=is_bool(os.environ.get("REGISTRATION_ENABLED")),
jwtTokenLifetime=JWT_TOKEN_LIFETIME,
defaultBehaviorTimeSeconds=int(
os.environ.get("DEFAULT_BEHAVIOR_TIME_SECONDS", 300)
),
defaultPageLoadTimeSeconds=int(
os.environ.get("DEFAULT_PAGE_LOAD_TIME_SECONDS", 120)
),
maxPagesPerCrawl=int(os.environ.get("MAX_PAGES_PER_CRAWL", 0)),
numBrowsers=int(os.environ.get("NUM_BROWSERS", 1)),
maxScale=int(os.environ.get("MAX_CRAWL_SCALE", 3)),
billingEnabled=is_bool(os.environ.get("BILLING_ENABLED")),
signUpUrl=os.environ.get("SIGN_UP_URL", ""),
salesEmail=os.environ.get("SALES_EMAIL", ""),
        supportEmail=os.environ.get("SUPPORT_EMAIL", ""),
localesEnabled=(
[lang.strip() for lang in os.environ.get("LOCALES_ENABLED", "").split(",")]
if os.environ.get("LOCALES_ENABLED")
else None
),
)
invites = init_invites(mdb, email)
user_manager = init_user_manager(mdb, email, invites)
current_active_user, shared_secret_or_active_user = init_users_api(
app, user_manager
)
org_ops = init_orgs_api(
app,
mdb,
user_manager,
invites,
current_active_user,
)
init_subs_api(app, mdb, org_ops, user_manager, shared_secret_or_active_user)
event_webhook_ops = init_event_webhooks_api(mdb, org_ops, app_root)
# pylint: disable=import-outside-toplevel
if not os.environ.get("KUBERNETES_SERVICE_HOST"):
print(
"Sorry, the Browsertrix Backend must be run inside a Kubernetes environment.\
Kubernetes not detected (KUBERNETES_SERVICE_HOST is not set), Exiting"
)
sys.exit(1)
crawl_manager = CrawlManager()
storage_ops = init_storages_api(
org_ops, crawl_manager, app, mdb, current_active_user
)
background_job_ops = init_background_jobs_api(
app,
mdb,
email,
user_manager,
org_ops,
crawl_manager,
storage_ops,
current_active_user,
)
profiles = init_profiles_api(
mdb,
org_ops,
crawl_manager,
storage_ops,
background_job_ops,
current_active_user,
)
crawl_config_ops = init_crawl_config_api(
app,
dbclient,
mdb,
current_active_user,
user_manager,
org_ops,
crawl_manager,
profiles,
)
coll_ops = init_collections_api(
app, mdb, org_ops, storage_ops, event_webhook_ops, current_active_user
)
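    # positional args shared by init_base_crawls_api, init_crawls_api and init_uploads_api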
base_crawl_init = (
app,
current_active_user,
# to basecrawls
mdb,
user_manager,
org_ops,
crawl_config_ops,
coll_ops,
storage_ops,
event_webhook_ops,
background_job_ops,
)
base_crawl_ops = init_base_crawls_api(*base_crawl_init)
crawls = init_crawls_api(crawl_manager, *base_crawl_init)
upload_ops = init_uploads_api(*base_crawl_init)
page_ops = init_pages_api(
app,
mdb,
crawls,
org_ops,
storage_ops,
background_job_ops,
coll_ops,
current_active_user,
)
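    # page_ops is created last, then injected back into the earlier ops objects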
base_crawl_ops.set_page_ops(page_ops)
crawls.set_page_ops(page_ops)
upload_ops.set_page_ops(page_ops)
org_ops.set_ops(base_crawl_ops, profiles, coll_ops, background_job_ops, page_ops)
user_manager.set_ops(org_ops, crawl_config_ops, base_crawl_ops)
background_job_ops.set_ops(base_crawl_ops, profiles)
crawl_config_ops.set_coll_ops(coll_ops)
coll_ops.set_page_ops(page_ops)
# await db init, migrations should have already completed in init containers
asyncio.create_task(await_db_and_migrations(mdb, db_inited))
app.include_router(org_ops.router)
@app.get("/settings", tags=["settings"], response_model=SettingsResponse)
async def get_settings() -> SettingsResponse:
if not db_inited.get("inited"):
raise HTTPException(status_code=503, detail="not_ready_yet")
return settings
# internal routes
@app.get("/openapi.json", include_in_schema=False)
async def openapi() -> JSONResponse:
return JSONResponse(app_root.openapi())
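    # k8s probes: a startupProbe would typically target /healthzStartup,
    # liveness/readiness probes /healthz (assumed chart wiring)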
# Used for startup
# Returns 200 only when db is available + migrations are done
@app_root.get("/healthzStartup", include_in_schema=False)
async def healthz_startup():
if not db_inited.get("inited"):
raise HTTPException(status_code=503, detail="not_ready_yet")
return {}
# Used for readiness + liveness
# Always returns 200 while running
@app_root.get("/healthz", include_in_schema=False)
async def healthz():
return {}
app_root.include_router(app, prefix=API_PREFIX)
# API Configurations -- needed to provide custom favicon
@app_root.get(API_PREFIX + "/docs", include_in_schema=False)
def overridden_swagger():
return get_swagger_ui_html(
openapi_url=OPENAPI_URL,
title="Browsertrix API",
swagger_favicon_url="/favicon.ico",
)
@app_root.get(API_PREFIX + "/redoc", include_in_schema=False)
def overridden_redoc():
return get_redoc_html(
openapi_url=OPENAPI_URL,
title="Browsertrix API",
redoc_favicon_url="/favicon.ico",
)
def get_api_schema():
if not app_root.openapi_schema:
app_root.openapi_schema = make_schema()
return app_root.openapi_schema
app_root.openapi = get_api_schema # type: ignore
# ============================================================================
@app_root.on_event("startup")
async def startup():
"""init on startup"""
register_exit_handler()
main()
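# typically served via gunicorn with uvicorn workers, e.g. (assumed invocation):
#   gunicorn btrixcloud.main:app_root -k uvicorn.workers.UvicornWorker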