browsertrix/backend/btrixcloud/main.py
Ilya Kreymer fb3d88291f
Background Jobs Work (#1321)
Fixes #1252 

Supports a generic background job system, with two background jobs,
CreateReplicaJob and DeleteReplicaJob.
- CreateReplicaJob runs for new crawls, uploads, and profiles, and updates the
`replicas` array with info about the replica after the job succeeds.
- DeleteReplicaJob deletes the replica.
- Both jobs are created from the new `replica_job.yaml` template. The
CreateReplicaJob sets secrets for primary storage + replica storage,
while DeleteReplicaJob only needs the replica storage.
- The job is processed in the operator when the job is finalized
(deleted), which should happen immediately when the job is done, either
because it succeeds or because the backoffLimit is reached (currently
set to 3).
- /jobs/ API lists all jobs in a paginated response, with filtering and sorting (a usage sketch follows this list)
- /jobs/<job id> returns details for a particular job
- tests: nightly tests updated to check create + delete replica jobs for both crawls and uploads, as well as the job API endpoints
- tests: timeouts in nightly tests also fixed so that crawls don't finish too quickly.
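
A rough sketch of exercising the new endpoints, assuming a deployment base URL, a bearer token from a prior login, and the usual paginated-response shape; the exact query parameter and response field names (`page`, `pageSize`, `sortBy`, `items`) are assumptions here, not taken from this change:

```python
import os

import requests

# assumed environment: API base URL and a bearer token from a prior login
api_url = os.environ["BTRIX_API_URL"]  # e.g. "https://app.example.com/api"
headers = {"Authorization": f"Bearer {os.environ['BTRIX_API_TOKEN']}"}

# list background jobs one page at a time (query parameter names are assumptions)
resp = requests.get(
    f"{api_url}/jobs/",
    headers=headers,
    params={"page": 1, "pageSize": 25, "sortBy": "started"},
)
resp.raise_for_status()
jobs = resp.json().get("items", [])

# fetch details for a single job by id
if jobs:
    detail = requests.get(f"{api_url}/jobs/{jobs[0]['id']}", headers=headers)
    detail.raise_for_status()
    print(detail.json())
```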

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-11-02 13:02:17 -07:00


"""
main file for browsertrix-api system
supports docker and kubernetes based deployments of multiple browsertrix-crawlers
"""
import os
import asyncio
import sys
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from fastapi.routing import APIRouter
from .db import init_db, await_db_and_migrations, update_and_prepare_db
from .emailsender import EmailSender
from .invites import init_invites
from .auth import JWT_TOKEN_LIFETIME
from .users import init_users_api, init_user_manager
from .orgs import init_orgs_api
from .profiles import init_profiles_api
from .storages import init_storages_api
from .uploads import init_uploads_api
from .crawlconfigs import init_crawl_config_api
from .colls import init_collections_api
from .crawls import init_crawls_api
from .basecrawls import init_base_crawls_api
from .webhooks import init_event_webhooks_api
from .background_jobs import init_background_jobs_api
from .crawlmanager import CrawlManager
from .utils import run_once_lock, register_exit_handler, is_bool
API_PREFIX = "/api"
app_root = FastAPI(
docs_url=API_PREFIX + "/docs",
redoc_url=API_PREFIX + "/redoc",
openapi_url=API_PREFIX + "/openapi.json",
)
db_inited = {"inited": False}
# ============================================================================
# pylint: disable=too-many-locals, duplicate-code
def main():
    """init browsertrix cloud api"""

    app = APIRouter()

    email = EmailSender()
    crawl_manager = None

    dbclient, mdb = init_db()

    # settings exposed to clients via the /settings endpoint below
    settings = {
        "registrationEnabled": is_bool(os.environ.get("REGISTRATION_ENABLED")),
        "jwtTokenLifetime": JWT_TOKEN_LIFETIME,
        "defaultBehaviorTimeSeconds": int(
            os.environ.get("DEFAULT_BEHAVIOR_TIME_SECONDS", 300)
        ),
        "defaultPageLoadTimeSeconds": int(
            os.environ.get("DEFAULT_PAGE_LOAD_TIME_SECONDS", 120)
        ),
        "maxPagesPerCrawl": int(os.environ.get("MAX_PAGES_PER_CRAWL", 0)),
    }
    invites = init_invites(mdb, email)

    user_manager = init_user_manager(mdb, email, invites)

    current_active_user = init_users_api(app, user_manager)

    org_ops = init_orgs_api(app, mdb, user_manager, invites, current_active_user)

    event_webhook_ops = init_event_webhooks_api(mdb, org_ops, app_root)

    # pylint: disable=import-outside-toplevel
    if not os.environ.get("KUBERNETES_SERVICE_HOST"):
        print(
            "Sorry, the Browsertrix Cloud Backend must be run inside a "
            "Kubernetes environment. Kubernetes not detected "
            "(KUBERNETES_SERVICE_HOST is not set), Exiting"
        )
        sys.exit(1)

    crawl_manager = CrawlManager()

    storage_ops = init_storages_api(org_ops, crawl_manager)

    background_job_ops = init_background_jobs_api(
        mdb, org_ops, crawl_manager, storage_ops
    )
    profiles = init_profiles_api(
        mdb,
        org_ops,
        crawl_manager,
        storage_ops,
        background_job_ops,
        current_active_user,
    )

    crawl_config_ops = init_crawl_config_api(
        dbclient,
        mdb,
        current_active_user,
        user_manager,
        org_ops,
        crawl_manager,
        profiles,
    )

    coll_ops = init_collections_api(app, mdb, org_ops, storage_ops, event_webhook_ops)

    # shared init args for the base-crawl, crawl, and upload APIs
    base_crawl_init = (
        app,
        current_active_user,
        mdb,
        user_manager,
        org_ops,
        crawl_manager,
        crawl_config_ops,
        coll_ops,
        storage_ops,
        event_webhook_ops,
        background_job_ops,
    )

    base_crawl_ops = init_base_crawls_api(*base_crawl_init)

    crawls = init_crawls_api(*base_crawl_init)

    init_uploads_api(*base_crawl_init)

    # wire up cross-references after construction to avoid circular init deps
    user_manager.set_ops(org_ops, crawl_config_ops, base_crawl_ops)
    background_job_ops.set_ops(base_crawl_ops, profiles)
    crawl_config_ops.set_coll_ops(coll_ops)
    # run db migrations only in the first worker to acquire the lock;
    # the other workers wait for migrations to complete
    if run_once_lock("btrix-init-db"):
        asyncio.create_task(
            update_and_prepare_db(
                mdb,
                user_manager,
                org_ops,
                crawls,
                crawl_config_ops,
                coll_ops,
                invites,
                storage_ops,
                db_inited,
            )
        )
    else:
        asyncio.create_task(await_db_and_migrations(mdb, db_inited))
    app.include_router(org_ops.router)

    @app.get("/settings")
    async def get_settings():
        if not db_inited.get("inited"):
            raise HTTPException(status_code=503, detail="not_ready_yet")
        return settings

    # internal routes
    @app.get("/openapi.json", include_in_schema=False)
    async def openapi() -> JSONResponse:
        return JSONResponse(app_root.openapi())

    @app_root.get("/healthz", include_in_schema=False)
    async def healthz():
        if not db_inited.get("inited"):
            raise HTTPException(status_code=503, detail="not_ready_yet")
        return {}

    app_root.include_router(app, prefix=API_PREFIX)

# ============================================================================
@app_root.on_event("startup")
async def startup():
    """init on startup"""
    register_exit_handler()
    main()
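
# A sketch of serving this module locally, assuming uvicorn is available;
# the actual deployment (worker count, process manager) may differ:
#
#   uvicorn btrixcloud.main:app_root --port 8000
#
# main() is invoked by the startup handler above, so all routes are
# registered on app_root before the server begins accepting requests.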