Resolves #1354 Supports crawling through pre-configured proxy servers, allowing users to select which proxy servers to use (requires browsertrix crawler 1.3+) Config: - proxies defined in btrix-proxies subchart - can be configured via btrix-proxies key or separate proxies.yaml file via separate subchart - proxies list refreshed automatically if crawler_proxies.json changes if subchart is deployed - support for ssh and socks5 proxies - proxy keys added to secrets in subchart - support for default proxy to be always used if no other proxy configured, prevent starting cluster if default proxy not available - prevent starting manual crawl if previously configured proxy is no longer available, return error - force 'btrix' username and group name on browsertrix-crawler non-root user to support ssh Operator: - support crawling through proxies, pass proxyId in CrawlJob - support running profile browsers with a designated proxy, pass proxyId to ProfileJob - prevent starting scheduled crawl if previously configured proxy is no longer available API / Access: - /api/orgs/all/crawlconfigs/crawler-proxies - get all proxies (superadmin only) - /api/orgs/{oid}/crawlconfigs/crawler-proxies - get proxies available to particular org - /api/orgs/{oid}/proxies - update allowed proxies for particular org (superadmin only) - superadmin can configure which orgs can use which proxies, stored on the org - superadmin can also allow an org to access all 'shared' proxies, to avoid having to allow a shared proxy on each org. UI: - Superadmin has 'Edit Proxies' dialog to configure, for each org, whether it has dedicated proxies and whether it has access to shared proxies. 
- User can select a proxy in Crawl Workflow browser settings - Users can choose to launch a browser profile with a particular proxy - Display which proxy is used to create profile in profile selector - Users can choose which default proxy to use for new workflows in Crawling Defaults --------- Co-authored-by: Ilya Kreymer <ikreymer@gmail.com> Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
64 lines
2.2 KiB
Python
64 lines
2.2 KiB
Python
""" Operator handler for ProfileJobs """
|
|
|
|
from btrixcloud.utils import str_to_date, dt_now
|
|
|
|
from btrixcloud.models import StorageRef
|
|
|
|
from .models import MCSyncData
|
|
from .baseoperator import BaseOperator
|
|
|
|
|
|
# ============================================================================
|
|
class ProfileOperator(BaseOperator):
    """Operator handler for profile browser sync requests"""

    def init_routes(self, app):
        """Register the POST sync endpoint for profile browsers on ``app``"""

        @app.post("/op/profilebrowsers/sync")
        async def mc_sync_profile_browsers(data: MCSyncData):
            # thin route wrapper: all logic lives in sync_profile_browsers
            return await self.sync_profile_browsers(data)

    async def sync_profile_browsers(self, data: MCSyncData):
        """Build the desired children for a profile browser.

        Reads the parent ``spec`` from ``data``. If the spec carries an
        ``expireTime`` that has already passed, the delete call for the
        browser is handed to ``run_task`` and an empty child list is
        returned. Otherwise the template parameters are assembled from
        the spec and ``profilebrowser.yaml`` is rendered via
        ``load_from_yaml``.

        Returns a dict with an empty ``status`` and the ``children`` list.
        Raises ``KeyError`` if the spec has no ``crawlerImage`` entry.
        """
        spec = data.parent.get("spec", {})

        expire_time = str_to_date(spec.get("expireTime"))
        browserid = spec.get("id")

        has_expired = expire_time and dt_now() >= expire_time
        if has_expired:
            # expired browser: request deletion and report no children
            self.run_task(self.k8s.delete_profile_browser(browserid))
            return {"status": {}, "children": []}

        # start from a copy of the shared params, then layer spec values on top
        params = dict(self.k8s.shared_params)
        params["id"] = browserid
        params["userid"] = spec.get("userid", "")

        oid = spec.get("oid")
        storage = StorageRef(spec.get("storageName"))
        params.update(
            storage_path=storage.get_storage_extra_path(oid),
            storage_secret=storage.get_storage_secret_name(oid),
            profile_filename=spec.get("profileFilename", ""),
            crawler_image=spec["crawlerImage"],
        )

        # proxy settings are only added when an id is set and it resolves
        proxy_id = spec.get("proxyId")
        proxy = (
            self.crawl_config_ops.get_crawler_proxy(proxy_id) if proxy_id else None
        )
        if proxy:
            params.update(
                proxy_id=proxy_id,
                proxy_url=proxy.url,
                proxy_ssh_private_key=proxy.has_private_key,
                proxy_ssh_host_public_key=proxy.has_host_public_key,
            )

        params["url"] = spec.get("startUrl", "about:blank")
        params["vnc_password"] = spec.get("vncPassword")

        return {
            "status": {},
            "children": self.load_from_yaml("profilebrowser.yaml", params),
        }
|