browsertrix/backend/btrixcloud/main_op.py
Tessa Walsh 14189b7cfb
Add crawl pages and related API endpoints (#1516)
Fixes #1502 

- Adds pages to the database as they are added to Redis during a crawl
- Adds a migration to backfill pages for older crawls from the
pages.jsonl and extraPages.jsonl files in each WACZ
- Adds GET, list GET, and PATCH (update) endpoints for pages
- Adds POST (add), PATCH, and POST (delete) endpoints for page notes,
each with its own id, timestamp, and user info in addition to text (see
the client sketch after this list)
- Adds page_ops methods for (1) adding resources/urls to a page and
(2) adding automated heuristics and supplemental info (mime, type, etc.)
to a page, for use in the crawl QA job
- Modifies the `Migration` class to accept kwargs so that ops classes
can be passed in as needed for migrations (sketched further below)
- Deletes WACZ files and pages from the database for failed crawls
during the crawl_finished process
- Deletes crawl pages when a crawl is deleted
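
For orientation, a rough client-side sketch of the new pages and notes
endpoints; the base URL, paths, and payload fields here are assumptions
for illustration, not the exact API contract:

```python
# hypothetical client calls; paths and fields are illustrative only
import requests

API = "https://app.example.com/api"  # hypothetical deployment URL
HEADERS = {"Authorization": "Bearer <token>"}
oid, crawl_id, page_id = "<org-id>", "<crawl-id>", "<page-id>"

# list pages in a crawl
pages = requests.get(
    f"{API}/orgs/{oid}/crawls/{crawl_id}/pages", headers=HEADERS
).json()

# update a single page
requests.patch(
    f"{API}/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}",
    headers=HEADERS,
    json={"title": "Corrected title"},
)

# add a note; the server attaches its own id, timestamp, and user info
requests.post(
    f"{API}/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}/notes",
    headers=HEADERS,
    json={"text": "Looks good in replay"},
)
```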

Note: Requires crawler version 1.0.0 beta3 or later, with support for
`--writePagesToRedis`, to populate pages at crawl completion. Beta 4 is
configured in the test chart and should be upgraded to stable 1.0.0
when it is released.
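
On the backend side, draining those pages from Redis might look roughly
like the following; the key name and page fields are assumptions for
illustration:

```python
# minimal sketch, assuming the crawler pushes one JSON object per page;
# the "<crawl_id>:pages" key name is an assumption, not the exact contract
import json

import redis.asyncio as redis  # redis-py >= 4.2


async def drain_pages(redis_url: str, crawl_id: str) -> list[dict]:
    """Pop page entries the crawler wrote to Redis."""
    conn = redis.from_url(redis_url, decode_responses=True)
    pages = []
    while True:
        raw = await conn.lpop(f"{crawl_id}:pages")  # assumed key layout
        if raw is None:
            break
        pages.append(json.loads(raw))  # e.g. {"url": ..., "ts": ..., "title": ...}
    return pages
```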

Connected to https://github.com/webrecorder/browsertrix-crawler/pull/464
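
The `Migration` kwargs change referenced above might be used roughly
like this; the `BaseMigration` import path, the `page_ops` kwarg, and
the method names are assumptions based on the description, not the
exact code:

```python
# sketch of a migration receiving an ops class via kwargs
from btrixcloud.migrations import BaseMigration  # assumed import path

MIGRATION_VERSION = "0023"  # hypothetical version number


class Migration(BaseMigration):
    """Backfill pages for older crawls."""

    def __init__(self, mdb, **kwargs):
        super().__init__(mdb, migration_version=MIGRATION_VERSION)
        # ops classes are passed in by the migration runner as needed
        self.page_ops = kwargs.get("page_ops")

    async def migrate_up(self):
        # use page_ops to re-add pages from pages.jsonl/extraPages.jsonl
        # in each WACZ (method name is hypothetical)
        await self.page_ops.readd_all_crawl_pages()
```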

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
2024-02-28 12:11:35 -05:00

""" entrypoint module for operator """
import os
import sys
from fastapi import FastAPI
from .crawlmanager import CrawlManager
from .db import init_db
from .emailsender import EmailSender
from .operator import init_operator_api
from .utils import register_exit_handler
from .invites import InviteOps
from .users import init_user_manager
from .orgs import OrgOps
from .colls import CollectionOps
from .crawlconfigs import CrawlConfigOps
from .crawls import CrawlOps
from .profiles import ProfileOps
from .storages import init_storages_api
from .webhooks import EventWebhookOps
from .background_jobs import BackgroundJobOps
from .pages import PageOps
app_root = FastAPI()
# ============================================================================
# pylint: disable=too-many-function-args, duplicate-code
def main():
"""main init"""
email = EmailSender()
crawl_manager = None
dbclient, mdb = init_db()
invite_ops = InviteOps(mdb, email)
user_manager = init_user_manager(mdb, email, invite_ops)
org_ops = OrgOps(mdb, invite_ops)
event_webhook_ops = EventWebhookOps(mdb, org_ops)
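
    # the operator can only run inside a Kubernetes cluster; bail out
    # early if the in-cluster service env var is not set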
    # pylint: disable=import-outside-toplevel
    if not os.environ.get("KUBERNETES_SERVICE_HOST"):
        print(
            "Sorry, the Browsertrix Cloud Backend must be run inside a Kubernetes environment.\
             Kubernetes not detected (KUBERNETES_SERVICE_HOST is not set), Exiting"
        )
        sys.exit(1)

    crawl_manager = CrawlManager()

    storage_ops = init_storages_api(org_ops, crawl_manager)

    background_job_ops = BackgroundJobOps(
        mdb, email, user_manager, org_ops, crawl_manager, storage_ops
    )

    profile_ops = ProfileOps(
        mdb, org_ops, crawl_manager, storage_ops, background_job_ops
    )

    crawl_config_ops = CrawlConfigOps(
        dbclient,
        mdb,
        user_manager,
        org_ops,
        crawl_manager,
        profile_ops,
    )

    user_manager.set_ops(org_ops, crawl_config_ops, None)

    coll_ops = CollectionOps(mdb, crawl_manager, org_ops, event_webhook_ops)

    crawl_ops = CrawlOps(
        mdb,
        user_manager,
        org_ops,
        crawl_manager,
        crawl_config_ops,
        coll_ops,
        storage_ops,
        event_webhook_ops,
        background_job_ops,
    )
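
    # crawl_ops and page_ops reference each other (pages are deleted with
    # their crawl), so the back-reference is set after both are constructed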
    page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops)

    crawl_ops.set_page_ops(page_ops)

    background_job_ops.set_ops(crawl_ops, profile_ops)

    return init_operator_api(
        app_root,
        crawl_config_ops,
        crawl_ops,
        org_ops,
        coll_ops,
        storage_ops,
        event_webhook_ops,
        background_job_ops,
        page_ops,
    )


# ============================================================================
@app_root.on_event("startup")
async def startup():
    """init on startup"""
    register_exit_handler()

    oper = main()
    await oper.async_init()