Fixes #2259

This PR brings backend and frontend support for the new autoclick behavior in Browsertrix, introduced in Browsertrix Crawler 1.5.0+.

On the backend, we introduce `min_autoclick_crawler_image` to `values.yaml`, with a default value of `"docker.io/webrecorder/browsertrix-crawler:1.5.0"`. If this is set and the crawler version for a new crawl is lower than this value, the autoclick behavior is removed from the behaviors list in the configmap created for the crawl.

The one caveat is that a crawler image tag like "latest" will always be parsed as greater than `min_autoclick_crawler_image`, so the crawler could run into issues if a non-numeric image tag is used with an older version of the crawler. In production we use hardcoded, specific versions of the crawler except for the dev channel, which from here on out will include autoclick support, so I think this should be okay (the same is also true of the existing implementation for checking `min_qa_crawler_image`).

On the frontend, I've added a checkbox (unchecked by default) in the "Limits" section, just below the current checkbox for autoscroll. We might want to move these to a different section eventually - I'm not sure Limits is the right place for them - but I wanted to be consistent with things as they are.

---------

Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
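For reference, the gating works roughly like the sketch below. This is illustrative only, not the actual backend code: the function name and the plain string comparison between full image names are assumptions. It shows why a non-numeric tag like `:latest` always compares as newer than the `1.5.0` default.

```python
# Hypothetical sketch -- not the actual Browsertrix implementation.
# Assumes a simple string comparison between full image names, which is why
# a non-numeric tag like ":latest" always compares greater than ":1.5.0".
def filter_behaviors(behaviors: list[str],
                     crawler_image: str,
                     min_autoclick_crawler_image: str) -> list[str]:
    """Remove 'autoclick' if the crawler image is older than the minimum."""
    if min_autoclick_crawler_image and crawler_image < min_autoclick_crawler_image:
        return [b for b in behaviors if b != "autoclick"]
    return behaviors


# Example: crawler 1.4.2 is below the 1.5.0 minimum, so autoclick is dropped
print(filter_behaviors(
    ["autoscroll", "autoclick"],
    "docker.io/webrecorder/browsertrix-crawler:1.4.2",
    "docker.io/webrecorder/browsertrix-crawler:1.5.0",
))  # -> ['autoscroll']
```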
# Global Settings
# =========================================

# locales available to choose from in the UI
# if not set, all locales available by default
# ex: enable only 'en' and 'es' locales
# locales_enabled: "en,es"

# Crawler Settings
# =========================================

# default time to run behaviors on each page (in seconds)
default_behavior_time_seconds: 300

# default time to wait for page to fully load before running behaviors (in seconds)
default_page_load_time_seconds: 120

# disk utilization threshold percentage - when used disk space passes
# this percentage of total, crawls will gracefully stop to prevent the
# disk from being filled
# This should be a string so that it can be included in crawler_args
disk_utilization_threshold: 90

# crawler logging flags
crawler_logging_opts: "stats,behaviors,debug"

# to enable, set to one or more comma-separated values: to-warc,to-pages,final-to-warc
crawler_extract_full_text: to-warc

# max pages per crawl
# set to non-zero value to enforce global max pages per crawl limit
# if 0, there is no page limit (may need to adjust crawler/redis settings for larger crawls)
# if set, each workflow can have a lower limit, but not higher
max_pages_per_crawl: 50000


# default template for generated WACZ files
# supports the following interpolated vars:
# @ts - current timestamp
# @hostname - full hostname
# @hostsuffix - last 14 characters of hostname
# @id - full crawl id
default_crawl_filename_template: "@ts-@hostsuffix.wacz"
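# For example (illustrative only - the exact timestamp format depends on the
# crawler), on a host named "crawl-instance-0-abcdef" the default template
# above could yield a filename like "20250101120000-tance-0-abcdef.wacz":
# the crawl timestamp plus the last 14 characters of the hostname.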

# advanced: additional args to be passed to the crawler
# this is mostly for testing of new/experimental crawler flags
# standard crawler options are covered with other options above
crawler_extra_args: ""


# max allowed crawl scale per crawl
max_crawl_scale: 3

# Cluster Settings
# =========================================

name: browsertrix-cloud

# when running in the cloud, set this value to cloud-specific block storage
# keep empty to use hostPath (eg. on minikube)
volume_storage_class:

# if set, sets the node selector 'nodeType' for deployment pods
# main_node_type:

# if set, sets the node selector 'nodeType' for crawling pods
# crawler_node_type:

# if set to "1", enables open registration
registration_enabled: "0"

# if set, along with 'registration_enabled', will add registered users to this org
# registration_org_id: ""

jwt_token_lifetime_minutes: 1440

# if set to "1", allow inviting same user to same org multiple times
allow_dupe_invites: "0"

# number of seconds before pending invites expire - default is 7 days
invite_expire_seconds: 604800

# base url for replayweb.page
rwp_base_url: "https://cdn.jsdelivr.net/npm/replaywebpage@2.2.4/"

superuser:
  # set this to enable a superuser admin
  email: admin@example.com

  # optional: if not set, automatically generated
  # change or remove this
  password: PASSW0RD!

# Set name for default organization created with superuser
default_org: "My Organization"

# Set number of days replica file deletion should be delayed by
# if set >0, will keep replicas (if any) for this number of days
replica_deletion_delay_days: 0

# API Image
# =========================================
backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.0-beta.0"
backend_pull_policy: "Always"

backend_password_secret: "PASSWORD!"

# number of workers per pod
backend_workers: 1

backend_cpu: "100m"

backend_memory: "350Mi"

# port for operator service
opPort: 8756

job_cpu: "3m"
job_memory: "70Mi"

profile_browser_idle_seconds: 60

# set to true to enable subscriptions API and Billing tab
billing_enabled: false

# set URL to external sign-up page
# the internal sign-up page will take precedence if
# `registration_enabled` is set to `"1"`
sign_up_url: ""

# set e-mail to show for subscriptions related info
sales_email: ""


# survey e-mail
# if set, subscription cancellation e-mails will include a link to this survey
user_survey_url: ""

# if set, print last 'log_failed_crawl_lines' of each failed
# crawl pod to backend operator stdout
# mostly intended for debugging / testing
# log_failed_crawl_lines: 200

# Autoscale
# ---------
# max number of backend pods to scale to
# if > 1, will enable HPA for backend
backend_max_replicas: 1

# scale up if avg cpu utilization exceeds
backend_avg_cpu_threshold: 80

# scale up if avg memory utilization exceeds
backend_avg_memory_threshold: 95

# Nginx Image
# =========================================
frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.0-beta.0"
frontend_pull_policy: "Always"

frontend_cpu: "10m"

frontend_memory: "64Mi"

# if set, maps nginx to a fixed port on host machine
# must be between 30000 - 32767
# use for deployments on localhost when not using ingress
# if using ingress, this value is ignored
local_service_port: 30870

frontend_alias: "http://browsertrix-cloud-frontend"

# custom URL for where Browsertrix docs are hosted
# by default, docs are served from /docs/ but can be served from a custom
# URL specified here.
# docs_url: "https://browsertrix-docs.example.com/"

# Autoscaling
# -----------
# max number of frontend pods to scale to
# if > 1, will enable HPA for frontend
frontend_max_replicas: 1

# scale up if avg cpu utilization exceeds
frontend_avg_cpu_threshold: 80

# scale up if avg memory utilization exceeds
frontend_avg_memory_threshold: 95


# MongoDB Image
# =========================================
mongo_local: true

mongo_host: "local-mongo.default"

mongo_image: "docker.io/library/mongo:6.0.5"
mongo_pull_policy: "IfNotPresent"

mongo_cpu: "12m"

mongo_memory: "512Mi"


mongo_auth:
  # specify either username + password (for local mongo)
  username: root
  password: PASSWORD!

  # or full URL (for remote mongo server)
  # db_url: mongodb+srv://...

# Redis Image
# =========================================
redis_local: true

redis_image: "redis"
redis_pull_policy: "IfNotPresent"

redis_url: "redis://local-redis.default:6379/1"

redis_cpu: "10m"

redis_memory: "200Mi"

redis_storage: "3Gi"

# Crawler Channels
# =========================================
# Support for additional crawler release channels
# If more than one channel is provided, a dropdown will be shown to users
# 'default' channel must always be included
crawler_channels:
  - id: default
    image: "docker.io/webrecorder/browsertrix-crawler:latest"

  # Add, remove, or edit additional crawler versions below, for example:
  # - id: custom_version
  #   image: "<DOCKER IMAGE>"

crawler_pull_policy: "Always"

crawler_namespace: "crawlers"

# if set, will restrict QA to crawler image names that are >= this value
# min_qa_crawler_image: ""

# if set, will restrict the autoclick behavior to crawler image names that are >= this value
min_autoclick_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.5.0"
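# For example, with the default above a crawl using
# "docker.io/webrecorder/browsertrix-crawler:1.4.2" would have autoclick
# removed from its behaviors, while ":1.5.0" and later - as well as
# non-numeric tags such as ":latest", which compare as newer - keep it.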

# optional: enable to use a persistent volume claim for all crawls
# can be enabled to use a multi-write shared filesystem
# crawler_pv_claim: "nfs-shared-crawls"

# num retries
crawl_retries: 1000

# Crawler Resources
# -----------------

# base cpu for 1 browser
crawler_cpu_base: 900m

# base memory for 1 browser
crawler_memory_base: 1024Mi

# number of browser workers per crawler instance
crawler_browser_instances: 2

# number of browser workers per crawler instance for QA runs
# defaults to 'crawler_browser_instances' if not set
# qa_browser_instances: 2

# fixed scale (number of crawler pods) for QA runs
qa_scale: 1

# this value is added to crawler_cpu_base for each additional browser
# crawler_cpu = crawler_cpu_base + crawler_extra_cpu_per_browser * (crawler_browser_instances - 1)
crawler_extra_cpu_per_browser: 600m

crawler_extra_memory_per_browser: 768Mi

# if not set, defaults to the following, but can be overridden directly:
# crawler_cpu = crawler_cpu_base + crawler_extra_cpu_per_browser * (crawler_browser_instances - 1)
# crawler_cpu:

# if not set, defaults to the following, but can be overridden directly:
# crawler_memory = crawler_memory_base + crawler_extra_memory_per_browser * (crawler_browser_instances - 1)
# crawler_memory:
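# Worked example with the defaults above (for illustration only):
#   crawler_cpu    = 900m   + 600m  * (2 - 1) = 1500m
#   crawler_memory = 1024Mi + 768Mi * (2 - 1) = 1792Mi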

# Crawler Autoscaling
# ---------------------

# if set to true, automatically adjust crawler memory usage up to max_crawler_memory
enable_auto_resize_crawlers: false


# max crawler memory, if set, will enable auto-resizing of crawler pods up to this size
# if not set, no auto-resizing is done, and crawls always use 'crawler_memory' memory
# max_crawler_memory:

# optional: defaults to crawler_memory_base and crawler_cpu_base if not set
# profile_browser_memory:
#
# profile_browser_cpu:

# optional: set the workdir size for the profilebrowser pods
# the workdir is used to store the browser profile data and other temporary files
# profile_browser_workdir_size: 4Gi

# Other Crawler Settings
# ----------------------

# minimum storage allocated to each crawler
# should be at least double the crawl session size to ensure space for WACZ and browser profile data
crawler_storage: "25Gi"


# if set, will ensure 'crawler_storage' is at least this many times the used storage
# eg. if a crawl session reaches 10GB and this value is 2.5, will attempt
# to resize to at least 25GB
crawler_min_avail_storage_ratio: 2.5

# max size at which crawler will commit current crawl session
crawler_session_size_limit_bytes: "10000000000"

# max time in seconds after which crawler will restart, if set
crawler_session_time_limit_seconds: 18000

crawler_liveness_port: 6065

# optional: use this proxy by default, when no other proxy is set for the crawl
# must match one of the proxy ids in the 'btrix-proxies.proxies' list
# will set the proxy to shared
# default_proxy: "proxy-id"

# optional: enable the proxies subchart and configure a list of ssh servers to be used as crawler proxies
btrix-proxies:
  enabled: false  # enable to deploy proxies configmap and secret
  crawler_namespace: "crawlers"
  proxies: []
  # - id: proxy-id  # name of the proxy shown in the dropdown; must be lowercase and alphanumeric, may contain dashes
  #   url:  # proxy connection string, must be an ssh://, socks:// or http:// URL
  #   label: "US Proxy"  # label to show in dropdown
  #   country_code: US  # Alpha-2 ISO 3166 country code, https://www.iso.org/obp/ui/#search
  #   description: "Proxy"  # optional: description to show for the proxy
  #   shared: false  # optional: set to true to make the proxy available to all orgs
  #   ssh_private_key: |  # required for ssh:// proxies
  #     # ssh key needed to connect to the SSH server
  #     <secret key>
  #
  #   ssh_host_public_key: |  # optional, for ssh:// proxies only
  #     # ssh public keys of the SSH server
  #     # use output of `ssh-keyscan $hostname -p $port` for best results
  #     example.invalid:22 SSH-2.0-OpenSSH_9.6p1 Ubuntu-3ubuntu13
  #     example.invalid ssh-rsa AAA[..]

# optional: set the uid, gid and fsgroup for the crawler and profilebrowser pods
# the following values are used by default:
# crawler_uid: 201407
# crawler_gid: 201407
# crawler_fsgroup: 201407


# optional: enable/disable crawler network policy
crawler_enable_network_policy: true

# optional: replace the default crawler egress policy with your own
# see chart/templates/networkpolicies.yaml for an example
# crawler_network_policy_egress: {}

# time to wait for graceful stop
grace_period: 1000

# Local Minio Pod (optional)
# =========================================
# set to true to use a local minio image
minio_local: true

# enable to allow access to minio console via specified port
# minio_local_console_port: 30091

minio_scheme: "http"
minio_host: "local-minio.default:9000"

minio_image: docker.io/minio/minio:RELEASE.2022-10-24T18-35-07Z
minio_mc_image: minio/mc
minio_pull_policy: "IfNotPresent"

minio_local_bucket_name: &local_bucket_name "btrix-data"

minio_cpu: "10m"
minio_memory: "1024Mi"

# Storage
# =========================================
# should include the local minio bucket, if enabled, and any other available buckets for default storage

storages:
  - name: "default"
    type: "s3"
    access_key: "ADMIN"
    secret_key: "PASSW0RD"
    bucket_name: *local_bucket_name

    endpoint_url: "http://local-minio.default:9000/"
    access_endpoint_url: "/data/"
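  # Illustrative example of an additional bucket entry, reusing the same
  # fields as above (hypothetical name, credentials, and endpoint):
  # - name: "example-bucket"
  #   type: "s3"
  #   access_key: "EXAMPLE_ACCESS_KEY"
  #   secret_key: "EXAMPLE_SECRET_KEY"
  #   bucket_name: "btrix-example"
  #   endpoint_url: "https://s3.example.com/"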

# optional: duration in minutes for WACZ download links to be valid
# used by webhooks and replay
# max value = 10079 (one week minus one minute)
# storage_presign_duration_minutes: 10079

# Email Options
# =========================================
email:
  # email sending is enabled when 'smtp_host' is set to non-empty value
  # ex: smtp_host: smtp.gmail.com
  smtp_host: ""
  smtp_port: 587
  sender_email: example@example.com
  password: password
  reply_to_email: example@example.com
  use_tls: True

  # if True, will print contents of all emails sent to stdout log
  log_sent_emails: False

  # use to provide an additional support email in email templates
  support_email: ""

# Deployment options
# =========================================

# Ingress (Optional)
# Optional: if 'host' is set, a publicly accessible Ingress controller is created with an SSL cert (using letsencrypt)
ingress:
  #host: ""
  cert_email: "test@example.com"
  tls: false

  # If set, will use the old 'kubernetes.io/ingress.class' annotation instead of the new ingressClassName
  # also uses old http01.ingress.class in cert-manager instead of http01.ingress.ingressClassName
  # provided for backwards compatibility
  useOldClassAnnotation: false

# Optional: Uncomment to use your own cluster-issuer instead of default ACME https validation
# custom_cluster_issuer: custom_cluster_issuer-name

ingress_class: nginx

# Optional: Front-end injected script
# This runs as a blocking script on the frontend, so usually you'll want to have it just add a single
# script tag to the page with the `defer` attribute. Useful for things like analytics and bug tracking.
# inject_extra: // your front-end injected script
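# Illustrative sketch only (hypothetical analytics URL; verify how inject_extra
# is consumed in your deployment before relying on the exact scalar style):
# inject_extra: >-
#   var s = document.createElement("script");
#   s.src = "https://analytics.example.com/script.js";
#   s.defer = true;
#   document.head.appendChild(s);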

# Signing Options
# =========================================
# optionally enable signer
signer:
  enabled: false
  image: webrecorder/authsign:0.5.2
  # host: <set to signer domain>
  # cert_email: "test@example.com"
  # image_pull_policy: "IfNotPresent"
  # auth_token: <set to custom value>

signer_cpu: "5m"

signer_memory: "50Mi"


# Optional: configure load balancing annotations
# service:
#   annotations:
#     service.beta.kubernetes.io/aws-load-balancer-internal: "true"
#     helm.sh/resource-policy: keep

# Admin services (see Chart.yaml's dependencies)
# note: see `chart/examples/local-logging.yaml`
addons:
  admin:
    logging: false

# metacontroller: