browsertrix/backend/btrixcloud/main.py
Ilya Kreymer 00eb62214d
Uploads API: BaseCrawl refactor + Initial support for /uploads endpoint (#937)
* basecrawl refactor: make crawls db more generic, supporting different types of 'base crawls': crawls, uploads, manual archives
- move shared functionality to basecrawl.py
- create a base BaseCrawl object, which contains start / finish time, metadata and files array
- create BaseCrawlOps, base class for CrawlOps, which supports base crawl deletion, querying and collection add/remove
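For orientation, a minimal sketch of what the shared model might look like, assuming Pydantic (which the FastAPI backend already uses); the class and field shapes here are illustrative, not the exact definitions in basecrawl.py:

```python
from datetime import datetime
from typing import List, Optional
from pydantic import BaseModel

class CrawlFile(BaseModel):
    """one stored WACZ file (shape is illustrative)"""
    filename: str
    size: int
    hash: str

class BaseCrawl(BaseModel):
    """fields shared by crawls, uploads, and manual archives"""
    id: str
    oid: str                              # owning organization
    started: Optional[datetime] = None
    finished: Optional[datetime] = None
    state: str = "complete"
    tags: List[str] = []
    files: List[CrawlFile] = []
```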

* uploads api: (part of #929)
- new UploadedCrawl object which extends BaseCrawl and adds name and description
- support multipart form-data upload to /uploads/formdata
- support streaming upload of a single file via /uploads/stream, using botocore multipart upload to send the file to the s3-endpoint in parts (see the sketch after this list)
- require 'filename' param to set upload filename for streaming uploads (otherwise use form data names)
- sanitize filename, place uploads in /uploads/<uuid>/<sanitized-filename>-<random>.wacz
- uploads have internal id 'upload-<uuid>'
- create UploadedCrawl object with CrawlFiles pointing to the newly uploaded files, set state to 'complete'
- handle upload failures, abort multipart upload
- ensure uploads added within org bucket path
- return id / added when adding new UploadedCrawl
- support listing, deleting, and patch /uploads
- serve upload details via /replay.json to support replay
- add support for 'replaceId=<id>', which removes all previous files in the upload after the new upload succeeds; if replaceId doesn't exist, a new upload is created (stream endpoint only so far)
- support patching upload metadata: notes, tags, and name on uploads (UpdateUpload extends UpdateCrawl and adds 'name')
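A hedged sketch of the streaming path, written with boto3's synchronous multipart-upload calls for brevity (the real implementation is async on botocore, and bucket/key resolution lives in the storage layer); the chunk iterator and the 5 MiB part size are assumptions:

```python
import boto3

# S3 requires parts of at least 5 MiB (except the last one)
PART_SIZE = 5 * 1024 * 1024

def stream_to_s3(client, bucket, key, chunks):
    """Upload an iterable of byte chunks as a single S3 multipart upload."""
    mpu = client.create_multipart_upload(Bucket=bucket, Key=key)
    upload_id = mpu["UploadId"]
    parts = []
    try:
        buf = b""
        part_num = 1
        for chunk in chunks:
            buf += chunk
            while len(buf) >= PART_SIZE:
                resp = client.upload_part(
                    Bucket=bucket,
                    Key=key,
                    PartNumber=part_num,
                    UploadId=upload_id,
                    Body=buf[:PART_SIZE],
                )
                parts.append({"ETag": resp["ETag"], "PartNumber": part_num})
                part_num += 1
                buf = buf[PART_SIZE:]
        # flush whatever is left as the final (possibly short) part
        if buf:
            resp = client.upload_part(
                Bucket=bucket,
                Key=key,
                PartNumber=part_num,
                UploadId=upload_id,
                Body=buf,
            )
            parts.append({"ETag": resp["ETag"], "PartNumber": part_num})
        client.complete_multipart_upload(
            Bucket=bucket,
            Key=key,
            UploadId=upload_id,
            MultipartUpload={"Parts": parts},
        )
    except Exception:
        # on failure, abort so no orphaned parts are left behind
        client.abort_multipart_upload(Bucket=bucket, Key=key, UploadId=upload_id)
        raise

# usage sketch (endpoint, bucket, and key are examples only):
# client = boto3.client("s3", endpoint_url="http://local-minio:9000")
# stream_to_s3(client, "btrix-data", "uploads/<uuid>/example.wacz", request_chunks)
```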

* base crawls api: Add /all-crawls list and delete endpoints for all crawl types (without resources)
- support all-crawls/<id>/replay.json with resources
- Use ListCrawlOut model for /all-crawls list endpoint
- Extend BaseCrawlOut from ListCrawlOut, add type
- use 'type: crawl' for crawls and 'type: upload' for uploads
- migration: ensure all previous crawl objects missing a type are set to 'type: crawl'
- indexes: add db indices on the 'type' field alone, plus compound indices pairing 'type' with oid, cid, finished, and state (see the sketch below)
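A minimal sketch of what the migration and index creation described above could look like, assuming Motor's async collection API (as returned by init_db()); the collection name and exact index set are assumptions:

```python
# `mdb` is assumed to be the Motor database handle returned by init_db()
async def migrate_and_index_crawls(mdb):
    crawls = mdb["crawls"]

    # migration: mark all pre-existing docs without a type as crawls
    await crawls.update_many(
        {"type": {"$exists": False}}, {"$set": {"type": "crawl"}}
    )

    # index on 'type' alone, plus compound indexes with common query fields
    await crawls.create_index([("type", 1)])
    for field in ("oid", "cid", "finished", "state"):
        await crawls.create_index([("type", 1), (field, 1)])
```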

* tests: add tests for multipart and streaming uploads, listing uploads, and deleting uploads (see the sketch below)
- add sample WACZ for upload testing: 'example.wacz' and 'example-2.wacz'
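One of the new tests might look roughly like this, assuming the suite's usual requests-based fixtures (api_prefix, headers, and default_org_id are assumed names); the HTTP method and response fields are best guesses from the description above:

```python
import requests

def test_upload_formdata(api_prefix, headers, default_org_id):
    # upload one of the sample WACZs via the form-data endpoint;
    # path, method, and field names follow the commit description,
    # not a verified API spec
    with open("tests/data/example.wacz", "rb") as fh:
        r = requests.put(
            f"{api_prefix}/orgs/{default_org_id}/uploads/formdata?name=test-upload",
            headers=headers,
            files=[("uploads", ("example.wacz", fh, "application/octet-stream"))],
        )
    assert r.status_code == 200
    data = r.json()
    assert data["added"]
    assert data["id"]
```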

* collections: support adding and removing both crawls and uploads via the base crawl layer (see the sketch below)
- include collection_ids in /all-crawls list
- collections replay.json can include both crawls and uploads
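Because both types live in the shared base-crawl collection, add/remove can be a single update regardless of type; a minimal sketch assuming Motor and a collection_ids array field (function and field names are illustrative, not the exact BaseCrawlOps API):

```python
from uuid import UUID

# `crawls` is assumed to be the shared Motor collection holding both
# 'crawl' and 'upload' documents
async def add_to_collection(crawls, org_id, crawl_ids, coll_id: UUID):
    await crawls.update_many(
        {"_id": {"$in": crawl_ids}, "oid": org_id},
        {"$addToSet": {"collection_ids": coll_id}},
    )

async def remove_from_collection(crawls, org_id, crawl_ids, coll_id: UUID):
    await crawls.update_many(
        {"_id": {"$in": crawl_ids}, "oid": org_id},
        {"$pull": {"collection_ids": coll_id}},
    )
```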

bump version to 1.6.0-beta.2
---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-07-07 09:13:26 -07:00

"""
main file for browsertrix-api system
supports docker and kubernetes based deployments of multiple browsertrix-crawlers
"""
import os
import asyncio
import sys
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from fastapi.routing import APIRouter
from .db import init_db, ping_db, update_and_prepare_db
from .emailsender import EmailSender
from .invites import init_invites
from .users import init_users_api, init_user_manager, JWT_TOKEN_LIFETIME
from .orgs import init_orgs_api
from .profiles import init_profiles_api
from .storages import init_storages_api
from .uploads import init_uploads_api
from .crawlconfigs import init_crawl_config_api
from .colls import init_collections_api
from .crawls import init_crawls_api
from .basecrawls import init_base_crawls_api
from .crawlmanager import CrawlManager
from .utils import run_once_lock, register_exit_handler
API_PREFIX = "/api"
app_root = FastAPI(
docs_url=API_PREFIX + "/docs",
redoc_url=API_PREFIX + "/redoc",
openapi_url=API_PREFIX + "/openapi.json",
)
db_inited = {"inited": False}
# ============================================================================
# pylint: disable=too-many-locals
def main():
"""init browsertrix cloud api"""
app = APIRouter()
email = EmailSender()
crawl_manager = None
dbclient, mdb = init_db()
settings = {
"registrationEnabled": os.environ.get("REGISTRATION_ENABLED") == "1",
"jwtTokenLifetime": JWT_TOKEN_LIFETIME,
"defaultBehaviorTimeSeconds": int(
os.environ.get("DEFAULT_BEHAVIOR_TIME_SECONDS", 300)
),
"defaultPageLoadTimeSeconds": int(
os.environ.get("DEFAULT_PAGE_LOAD_TIME_SECONDS", 120)
),
"maxPagesPerCrawl": int(os.environ.get("MAX_PAGES_PER_CRAWL", 0)),
}
invites = init_invites(mdb, email)
user_manager = init_user_manager(mdb, email, invites)
fastapi_users = init_users_api(app, user_manager)
current_active_user = fastapi_users.current_user(active=True)
org_ops = init_orgs_api(app, mdb, user_manager, invites, current_active_user)
user_manager.set_org_ops(org_ops)
# pylint: disable=import-outside-toplevel
if not os.environ.get("KUBERNETES_SERVICE_HOST"):
print(
"Sorry, the Browsertrix Cloud Backend must be run inside a Kubernetes environment.\
Kubernetes not detected (KUBERNETES_SERVICE_HOST is not set), Exiting"
)
sys.exit(1)
crawl_manager = CrawlManager()
init_storages_api(org_ops, crawl_manager, current_active_user)
init_uploads_api(
app, mdb, user_manager, crawl_manager, org_ops, current_active_user
)
init_base_crawls_api(
app, mdb, user_manager, crawl_manager, org_ops, current_active_user
)
profiles = init_profiles_api(mdb, crawl_manager, org_ops, current_active_user)
crawl_config_ops = init_crawl_config_api(
dbclient,
mdb,
current_active_user,
user_manager,
org_ops,
crawl_manager,
profiles,
)
crawls = init_crawls_api(
app,
mdb,
user_manager,
crawl_manager,
crawl_config_ops,
org_ops,
current_active_user,
)
coll_ops = init_collections_api(app, mdb, crawls, org_ops, crawl_manager)
crawl_config_ops.set_coll_ops(coll_ops)
# run only in first worker
if run_once_lock("btrix-init-db"):
asyncio.create_task(
update_and_prepare_db(
mdb,
user_manager,
org_ops,
crawls,
crawl_config_ops,
coll_ops,
invites,
db_inited,
)
)
else:
asyncio.create_task(ping_db(mdb, db_inited))
app.include_router(org_ops.router)
@app.get("/settings")
async def get_settings():
if not db_inited.get("inited"):
raise HTTPException(status_code=503, detail="not_ready_yet")
return settings
# internal routes
@app.get("/openapi.json", include_in_schema=False)
async def openapi() -> JSONResponse:
return JSONResponse(app_root.openapi())
@app_root.get("/healthz", include_in_schema=False)
async def healthz():
if not db_inited.get("inited"):
raise HTTPException(status_code=503, detail="not_ready_yet")
return {}
app_root.include_router(app, prefix=API_PREFIX)
# ============================================================================
@app_root.on_event("startup")
async def startup():
"""init on startup"""
register_exit_handler()
main()