browsertrix/backend/btrixcloud/main_op.py
Ilya Kreymer fb3d88291f
Background Jobs Work (#1321)
Fixes #1252 

Supports a generic background job system, with two background jobs,
CreateReplicaJob and DeleteReplicaJob.
- CreateReplicaJob runs for new crawls, uploads, and profiles, and updates the
`replicas` array with info about the replica after the job succeeds.
- DeleteReplicaJob deletes the replica.
- Both jobs are created from the new `replica_job.yaml` template. The
CreateReplicaJob sets secrets for primary storage + replica storage,
while DeleteReplicaJob only needs the replica storage.
- The job is processed in the operator when the job is finalized
(deleted), which should happen immediately when the job is done, either
because it succeeds or because the backoffLimit is reached (currently
set to 3). A hypothetical sketch of this handling follows this list.
- /jobs/ API lists all jobs in a paginated response, with filtering and sorting (see the usage example after this list)
- /jobs/<job id> returns details for a particular job
- tests: nightly tests updated to check create + delete replica jobs for crawls as well as uploads, and to exercise the job API endpoints
- tests: also fixes timeouts in nightly tests to avoid crawls finishing too quickly.
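
The finalizer-time processing is easiest to see as code. Below is a minimal, hypothetical sketch of that flow, not the actual operator code: the `handle_finalized_replica_job` name, the label keys, and the collection layout are all assumptions made for illustration only.

```python
# Hypothetical sketch of processing a replica job once Kubernetes finalizes it.
from datetime import datetime


async def handle_finalized_replica_job(job: dict, jobs_coll, files_coll) -> None:
    """Record the outcome of a finalized CreateReplicaJob / DeleteReplicaJob."""
    # The Job is only finalized after it has either succeeded or exhausted its
    # backoffLimit (3), so the outcome can be read straight from the Job status.
    success = job.get("status", {}).get("succeeded", 0) > 0

    labels = job.get("metadata", {}).get("labels", {})
    job_id = labels.get("job_id")
    job_type = labels.get("job_type")    # e.g. "create-replica" / "delete-replica"
    object_id = labels.get("object_id")  # crawl / upload / profile the file belongs to
    replica_ref = {"storage": labels.get("replica_storage")}

    # Mark the background job record finished (visible via the /jobs endpoints).
    await jobs_coll.find_one_and_update(
        {"_id": job_id},
        {"$set": {"finished": datetime.utcnow(), "success": success}},
    )

    if not success:
        return

    if job_type == "create-replica":
        # Record the new replica in the object's `replicas` array.
        await files_coll.find_one_and_update(
            {"_id": object_id}, {"$push": {"replicas": replica_ref}}
        )
    elif job_type == "delete-replica":
        # Drop the replica entry that was just deleted from replica storage.
        await files_coll.find_one_and_update(
            {"_id": object_id}, {"$pull": {"replicas": replica_ref}}
        )
```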
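
And a hedged usage example for the new job endpoints: the base URL, org scoping, auth header, the `pageSize`/`page`/`sortBy` query parameters, and the `total`/`items` response fields are assumptions modeled on the other paginated Browsertrix endpoints, not confirmed by this change.

```python
import requests

API_BASE = "https://btrix.example.com/api"   # hypothetical deployment
ORG_ID = "<org id>"                          # placeholder
HEADERS = {"Authorization": "Bearer <access token>"}

# /jobs/ -- paginated list of background jobs, with filtering and sorting
resp = requests.get(
    f"{API_BASE}/orgs/{ORG_ID}/jobs",
    headers=HEADERS,
    params={"pageSize": 25, "page": 1, "sortBy": "started"},
)
resp.raise_for_status()
page = resp.json()
print(f"{page['total']} background jobs")

# /jobs/<job id> -- details for one job
for job in page["items"]:
    detail = requests.get(
        f"{API_BASE}/orgs/{ORG_ID}/jobs/{job['id']}", headers=HEADERS
    )
    print(detail.json())
```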

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-11-02 13:02:17 -07:00


""" entrypoint module for operator """
import os
import sys
from fastapi import FastAPI
from .crawlmanager import CrawlManager
from .db import init_db
from .emailsender import EmailSender
from .operator import init_operator_api
from .utils import register_exit_handler
from .invites import InviteOps
from .users import init_user_manager
from .orgs import OrgOps
from .colls import CollectionOps
from .crawlconfigs import CrawlConfigOps
from .crawls import CrawlOps
from .profiles import ProfileOps
from .storages import init_storages_api
from .webhooks import EventWebhookOps
from .background_jobs import BackgroundJobOps
app_root = FastAPI()
# ============================================================================
# pylint: disable=too-many-function-args, duplicate-code
def main():
"""main init"""
email = EmailSender()
crawl_manager = None
dbclient, mdb = init_db()
invite_ops = InviteOps(mdb, email)
user_manager = init_user_manager(mdb, email, invite_ops)
org_ops = OrgOps(mdb, invite_ops)
event_webhook_ops = EventWebhookOps(mdb, org_ops)
# pylint: disable=import-outside-toplevel
if not os.environ.get("KUBERNETES_SERVICE_HOST"):
print(
"Sorry, the Browsertrix Cloud Backend must be run inside a Kubernetes environment.\
Kubernetes not detected (KUBERNETES_SERVICE_HOST is not set), Exiting"
)
sys.exit(1)
crawl_manager = CrawlManager()
storage_ops = init_storages_api(org_ops, crawl_manager)
background_job_ops = BackgroundJobOps(mdb, org_ops, crawl_manager, storage_ops)
profile_ops = ProfileOps(
mdb, org_ops, crawl_manager, storage_ops, background_job_ops
)
crawl_config_ops = CrawlConfigOps(
dbclient,
mdb,
user_manager,
org_ops,
crawl_manager,
profile_ops,
)
user_manager.set_ops(org_ops, crawl_config_ops, None)
coll_ops = CollectionOps(mdb, crawl_manager, org_ops, event_webhook_ops)
crawl_ops = CrawlOps(
mdb,
user_manager,
org_ops,
crawl_manager,
crawl_config_ops,
coll_ops,
storage_ops,
event_webhook_ops,
background_job_ops,
)
background_job_ops.set_ops(crawl_ops, profile_ops)
return init_operator_api(
app_root,
crawl_config_ops,
crawl_ops,
org_ops,
coll_ops,
storage_ops,
event_webhook_ops,
background_job_ops,
)
# ============================================================================
@app_root.on_event("startup")
async def startup():
"""init on startup"""
register_exit_handler()
oper = main()
await oper.async_init()