browsertrix/backend/btrixcloud/operator/cronjobs.py
Vinzenz Sinapius bb6e703f6a
Configure browsertrix proxies (#1847)
Resolves #1354

Supports crawling through pre-configured proxy servers, allowing users to select which proxy servers to use (requires browsertrix crawler 1.3+)

Config:
- proxies defined in btrix-proxies subchart
- can be configured via the btrix-proxies key or a separate proxies.yaml file passed to the subchart (a sketch of one possible entry shape follows this list)
- proxies list refreshed automatically when crawler_proxies.json changes, if the subchart is deployed
- support for ssh and socks5 proxies
- proxy keys added to secrets in subchart
- support for a default proxy that is always used if no other proxy is configured; prevent starting the cluster if the default proxy is not available
- prevent starting manual crawl if previously configured proxy is no longer available, return error
- force 'btrix' username and group name on browsertrix-crawler non-root user to support ssh
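As a rough illustration of the shape described above, a proxy entry and the default-proxy check might look like the following sketch (field names, file path, and helper names are illustrative assumptions, not the actual btrix-proxies chart schema):

```python
# Illustrative sketch only: field names and the file path are assumptions,
# not the actual btrix-proxies schema.
import json
from dataclasses import dataclass
from typing import Optional


@dataclass
class ProxyEntry:
    id: str
    url: str          # e.g. "ssh://user@proxy-host:22" or "socks5://proxy-host:1080"
    label: str = ""
    shared: bool = False


def load_proxies(path: str = "crawler_proxies.json") -> list[ProxyEntry]:
    """Load proxy entries; only ssh and socks5 schemes are supported."""
    with open(path, encoding="utf-8") as fh:
        raw = json.load(fh)
    proxies = []
    for entry in raw.get("proxies", []):
        if not entry["url"].startswith(("ssh://", "socks5://")):
            raise ValueError(f"unsupported proxy scheme: {entry['url']}")
        proxies.append(ProxyEntry(**entry))
    return proxies


def check_default_proxy(proxies: list[ProxyEntry], default_id: Optional[str]) -> None:
    """Fail fast (before starting the cluster) if the default proxy is missing."""
    if default_id and not any(p.id == default_id for p in proxies):
        raise RuntimeError(f"default proxy {default_id!r} not available")
```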

Operator:
- support crawling through proxies, pass proxyId in CrawlJob (see the sketch after this list)
- support running profile browsers with a designated proxy, pass proxyId to ProfileJob
- prevent starting scheduled crawl if previously configured proxy is no longer available
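For reference, the CrawlJob attachment declared by the operator could then carry the proxy id roughly as below. This is a hedged sketch: the real resource is rendered from the chart's crawl_job template by `new_crawl_job_yaml`, and the apiVersion and spec field names shown here are illustrative, not the exact CRD schema.

```python
# Hedged sketch of a CrawlJob attachment carrying a proxy id; apiVersion and
# spec field names are illustrative, not the exact CRD schema.
crawljob_attachment = {
    "apiVersion": "btrix.cloud/v1",
    "kind": "CrawlJob",
    "metadata": {
        "name": "crawljob-<crawl-id>",
        "labels": {"btrix.org": "<oid>", "btrix.crawlconfig": "<cid>"},
    },
    "spec": {
        "id": "<crawl-id>",
        "oid": "<oid>",
        "manual": False,              # scheduled run
        "proxyId": "proxy-us-east",   # empty when no proxy is configured
    },
}
```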

API / Access:
- /api/orgs/all/crawlconfigs/crawler-proxies - get all proxies (superadmin only; see the usage sketch after this list)
- /api/orgs/{oid}/crawlconfigs/crawler-proxies - get proxies available to particular org
- /api/orgs/{oid}/proxies - update allowed proxies for particular org (superadmin only)
- superadmin can configure which orgs can use which proxies, stored on the org
- superadmin can also allow an org to access all 'shared' proxies, to avoid having to allow a shared proxy on each org.
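A hedged usage sketch against the endpoints above (the HTTP methods, auth header, and update payload shape are assumptions rather than the documented API):

```python
# Assumes a superadmin session token; payload field names are illustrative.
import requests

API = "https://btrix.example.com/api"
HEADERS = {"Authorization": "Bearer <superadmin-token>"}
oid = "<org-id>"

# all proxies (superadmin only)
all_proxies = requests.get(
    f"{API}/orgs/all/crawlconfigs/crawler-proxies", headers=HEADERS
).json()

# proxies available to a particular org
org_proxies = requests.get(
    f"{API}/orgs/{oid}/crawlconfigs/crawler-proxies", headers=HEADERS
).json()

# update which proxies the org may use (superadmin only)
requests.post(
    f"{API}/orgs/{oid}/proxies",
    headers=HEADERS,
    json={"allowedProxies": ["proxy-us-east"], "allowSharedProxies": True},
)
```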

UI:
- Superadmin has an 'Edit Proxies' dialog to configure, for each org, which dedicated proxies it has and whether it has access to shared proxies
- User can select a proxy in Crawl Workflow browser settings
- Users can choose to launch a browser profile with a particular proxy
- Display which proxy is used to create profile in profile selector
- Users can choose which default proxy to use for new workflows in Crawling Defaults

---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2024-10-02 18:35:45 -07:00


""" Operator handler for crawl CronJobs """
from uuid import UUID
from typing import Optional
import yaml
from btrixcloud.utils import date_to_str, dt_now
from .models import MCDecoratorSyncData, CJS, MCDecoratorSyncResponse
from .baseoperator import BaseOperator
from ..models import CrawlConfig
# pylint: disable=too-many-locals
# ============================================================================
class CronJobOperator(BaseOperator):
"""CronJob Operator"""
def init_routes(self, app):
"""init routes for crawl CronJob decorator"""
@app.post("/op/cronjob/sync")
async def mc_sync_cronjob_crawls(
data: MCDecoratorSyncData,
) -> MCDecoratorSyncResponse:
return await self.sync_cronjob_crawl(data)
def get_finished_response(
self, metadata: dict[str, str], set_status=True, finished: Optional[str] = None
) -> MCDecoratorSyncResponse:
"""get final response to indicate cronjob created job is finished"""
if not finished:
finished = date_to_str(dt_now())
status = None
# set status on decorated job to indicate that its finished
if set_status:
status = {
"succeeded": 1,
"startTime": metadata.get("creationTimestamp"),
"completionTime": finished,
}
return MCDecoratorSyncResponse(
attachments=[],
# set on job to match default behavior when job finishes
annotations={"finished": finished},
status=status,
)
# pylint: disable=too-many-arguments
async def make_new_crawljob(
self,
cid: UUID,
oid: Optional[UUID],
userid: Optional[UUID],
crawl_id: str,
metadata: dict[str, str],
state: Optional[str],
) -> MCDecoratorSyncResponse:
"""declare new CrawlJob from cid, based on db data"""
# cronjob doesn't exist yet
crawlconfig: CrawlConfig
try:
crawlconfig = await self.crawl_config_ops.get_crawl_config(cid, oid)
# pylint: disable=bare-except
except:
print(
f"error: no crawlconfig {cid}. skipping scheduled job. old cronjob left over?"
)
return self.get_finished_response(metadata)
# get org
oid = crawlconfig.oid
org = await self.org_ops.get_org_by_id(oid)
# db create
user = None
if not userid:
userid = crawlconfig.modifiedBy
if userid:
user = await self.user_ops.get_by_id(userid)
if not userid or not user:
print(f"error: missing user for id {userid}")
return self.get_finished_response(metadata)
warc_prefix = self.crawl_config_ops.get_warc_prefix(org, crawlconfig)
if org.readOnly:
print(
f'org "{org.slug}" set to read-only. skipping scheduled crawl for workflow {cid}'
)
return self.get_finished_response(metadata)
if crawlconfig.proxyId and not self.crawl_config_ops.get_crawler_proxy(
crawlconfig.proxyId
):
print(
f"proxy {crawlconfig.proxyId} missing, skipping scheduled crawl for "
+ f'workflow {cid} in "{org.slug}"'
)
return self.get_finished_response(metadata)
# if no db state, add crawl in the db
if not state:
await self.crawl_config_ops.add_new_crawl(
crawl_id,
crawlconfig,
user,
manual=False,
)
print("Scheduled Crawl Created: " + crawl_id)
profile_filename = await self.crawl_config_ops.get_profile_filename(
crawlconfig.profileid, org
)
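
        # build the CrawlJob resource via the shared k8s helper; proxy_id is
        # passed through so the crawler runs through the workflow's configured proxy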
        crawl_id, crawljob = self.k8s.new_crawl_job_yaml(
            cid=str(cid),
            userid=str(userid),
            oid=str(oid),
            storage=str(org.storage),
            crawler_channel=crawlconfig.crawlerChannel or "default",
            scale=crawlconfig.scale,
            crawl_timeout=crawlconfig.crawlTimeout,
            max_crawl_size=crawlconfig.maxCrawlSize,
            manual=False,
            crawl_id=crawl_id,
            warc_prefix=warc_prefix,
            storage_filename=self.crawl_config_ops.default_filename_template,
            profile_filename=profile_filename or "",
            proxy_id=crawlconfig.proxyId or "",
        )

        return MCDecoratorSyncResponse(attachments=list(yaml.safe_load_all(crawljob)))

    async def sync_cronjob_crawl(
        self, data: MCDecoratorSyncData
    ) -> MCDecoratorSyncResponse:
        """create crawljobs from a job object spawned by cronjob"""
        metadata = data.object["metadata"]
        labels = metadata.get("labels", {})

        cid: str = labels.get("btrix.crawlconfig", "")
        oid: str = labels.get("btrix.org", "")
        userid: str = labels.get("btrix.userid", "")

        if not cid:
            print("error: cronjob missing 'cid', invalid cronjob")
            return self.get_finished_response(metadata)

        name = metadata.get("name")
        crawl_id = name

        actual_state, finished = await self.crawl_ops.get_crawl_state(
            crawl_id, is_qa=False
        )
        if finished:
            finished_str = date_to_str(finished)
            set_status = False
            # mark job as completed
            if not data.object["status"].get("succeeded"):
                print("Cron Job Complete!", finished)
                set_status = True

            return self.get_finished_response(metadata, set_status, finished_str)
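
        # not finished yet: check whether a CrawlJob attachment already exists
        # for this scheduled run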
        crawljobs = data.attachments[CJS]

        crawljob_id = f"crawljob-{crawl_id}"

        if crawljob_id not in crawljobs:
            response = await self.make_new_crawljob(
                UUID(cid),
                UUID(oid) if oid else None,
                UUID(userid) if userid else None,
                crawl_id,
                metadata,
                actual_state,
            )
        else:
            # just return existing crawljob, filter metadata, remove status and annotations
            crawljob = crawljobs[crawljob_id]
            crawljob["metadata"] = {
                "name": crawljob["metadata"]["name"],
                "labels": crawljob["metadata"].get("labels"),
            }
            crawljob.pop("status", "")

            response = MCDecoratorSyncResponse(attachments=[crawljob])

        return response