Fixes #2259

This PR brings backend and frontend support for the new autoclick behavior in Browsertrix, introduced in Browsertrix Crawler 1.5.0+.

On the backend, we introduce `min_autoclick_crawler_image` to `values.yaml`, with a default value of `"docker.io/webrecorder/browsertrix-crawler:1.5.0"`. If this is set and the crawler version for a new crawl is lower than this value, the autoclick behavior is removed from the behaviors list in the configmap created for the crawl.

The one caveat is that a crawler image tag like "latest" will always be parsed as greater than `min_autoclick_crawler_image`, so the crawler could run into issues if a non-numeric image tag is used with an older version of the crawler. In production we use hardcoded, specific versions of the crawler except for the dev channel, which from here on out will include autoclick support, so I think this should be okay (the same is also true of the existing implementation for checking `min_qa_crawler_image`).

On the frontend, I've added a checkbox (unchecked by default) in the "Limits" section, just below the current checkbox for autoscroll. We might want to move these to a different section eventually - I'm not sure Limits is the right place for them - but I wanted to be consistent with things as they are.

---------

Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
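For reference, the gating works roughly like the sketch below. This is illustrative only, not the actual backend code: the function name and the plain string comparison between full image names are assumptions. It shows why a non-numeric tag like `:latest` always compares as newer than the `1.5.0` default.

```python
# Hypothetical sketch -- not the actual Browsertrix implementation.
# Assumes a simple string comparison between full image names, which is why
# a non-numeric tag like ":latest" always compares greater than ":1.5.0".
def filter_behaviors(behaviors: list[str],
                     crawler_image: str,
                     min_autoclick_crawler_image: str) -> list[str]:
    """Remove 'autoclick' if the crawler image is older than the minimum."""
    if min_autoclick_crawler_image and crawler_image < min_autoclick_crawler_image:
        return [b for b in behaviors if b != "autoclick"]
    return behaviors


# Example: crawler 1.4.2 is below the 1.5.0 minimum, so autoclick is dropped
print(filter_behaviors(
    ["autoscroll", "autoclick"],
    "docker.io/webrecorder/browsertrix-crawler:1.4.2",
    "docker.io/webrecorder/browsertrix-crawler:1.5.0",
))  # -> ['autoscroll']
```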
# Global Settings
# =========================================

# locales available to choose from in the UI
# if not set, all locales available by default
# ex: enable only 'en' and 'es' locales
# locales_enabled: "en,es"

# Crawler Settings
# =========================================

# default time to run behaviors on each page (in seconds)
default_behavior_time_seconds: 300

# default time to wait for page to fully load before running behaviors (in seconds)
default_page_load_time_seconds: 120

# disk utilization threshold percentage - when used disk space passes
# this percentage of total, crawls will gracefully stop to prevent the
# disk from being filled
# This should be a string so that it can be included in crawler_args
disk_utilization_threshold: 90

# crawler logging flags
crawler_logging_opts: "stats,behaviors,debug"

# to enable, set to one or more comma-separated values: to-warc,to-pages,final-to-warc
crawler_extract_full_text: to-warc

# max pages per crawl
# set to non-zero value to enforce global max pages per crawl limit
# if 0, there is no page limit (may need to adjust crawler/redis settings for larger crawls)
# if set, each workflow can have a lower limit, but not higher
max_pages_per_crawl: 50000


# default template for generated WACZ files
# supports the following interpolated vars:
# @ts - current timestamp
# @hostname - full hostname
# @hostsuffix - last 14 characters of hostname
# @id - full crawl id
default_crawl_filename_template: "@ts-@hostsuffix.wacz"
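# For example (illustrative only - the exact timestamp format depends on the
# crawler), on a host named "crawl-instance-0-abcdef" the default template
# above could yield a filename like "20250101120000-tance-0-abcdef.wacz":
# the crawl timestamp plus the last 14 characters of the hostname.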

# advanced: additional args to be passed to the crawler
# this is mostly for testing of new/experimental crawler flags
# standard crawler options are covered with other options above
crawler_extra_args: ""


# max allowed crawl scale per crawl
max_crawl_scale: 3

# Cluster Settings
# =========================================

name: browsertrix-cloud

# when running in the cloud, set this value to cloud-specific block storage
# keep empty to use hostPath (eg. on minikube)
volume_storage_class:

# if set, sets the node selector 'nodeType' for deployment pods
# main_node_type:

# if set, sets the node selector 'nodeType' for crawling pods
# crawler_node_type:

# if set to "1", enables open registration
registration_enabled: "0"

# if set, along with 'registration_enabled', will add registered users to this org
# registration_org_id: ""

jwt_token_lifetime_minutes: 1440

# if set to "1", allow inviting same user to same org multiple times
allow_dupe_invites: "0"

# number of seconds before pending invites expire - default is 7 days
invite_expire_seconds: 604800

# base url for replayweb.page
rwp_base_url: "https://cdn.jsdelivr.net/npm/replaywebpage@2.2.4/"

superuser:
  # set this to enable a superuser admin
  email: admin@example.com

  # optional: if not set, automatically generated
  # change or remove this
  password: PASSW0RD!

# Set name for default organization created with superuser
default_org: "My Organization"

# Set number of days replica file deletion should be delayed by
# if set >0, will keep replicas (if any) for this number of days
replica_deletion_delay_days: 0

# API Image
# =========================================
backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.0-beta.0"
backend_pull_policy: "Always"

backend_password_secret: "PASSWORD!"

# number of workers per pod
backend_workers: 1

backend_cpu: "100m"

backend_memory: "350Mi"

# port for operator service
opPort: 8756

job_cpu: "3m"
job_memory: "70Mi"

profile_browser_idle_seconds: 60

# set to true to enable subscriptions API and Billing tab
billing_enabled: false

# set URL to external sign-up page
# the internal sign-up page will take precedence if
# `registration_enabled` is set to `"1"`
sign_up_url: ""

# set e-mail to show for subscriptions related info
sales_email: ""


# survey e-mail
# if set, subscription cancellation e-mails will include a link to this survey
user_survey_url: ""

# if set, print last 'log_failed_crawl_lines' of each failed
# crawl pod to backend operator stdout
# mostly intended for debugging / testing
# log_failed_crawl_lines: 200

# Autoscale
# ---------
# max number of backend pods to scale to
# if > 1, will enable HPA for backend
backend_max_replicas: 1

# scale up if avg cpu utilization exceeds
backend_avg_cpu_threshold: 80

# scale up if avg memory utilization exceeds
backend_avg_memory_threshold: 95

# Nginx Image
# =========================================
frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.0-beta.0"
frontend_pull_policy: "Always"

frontend_cpu: "10m"

frontend_memory: "64Mi"

# if set, maps nginx to a fixed port on host machine
# must be between 30000 - 32767
# use for deployments on localhost when not using ingress
# if using ingress, this value is ignored
local_service_port: 30870

frontend_alias: "http://browsertrix-cloud-frontend"

# custom URL for where Browsertrix docs are hosted
# by default, docs are served from /docs/ but can be served from a custom
# URL specified here.
# docs_url: "https://browsertrix-docs.example.com/"

# Autoscaling
# -----------
# max number of frontend pods to scale to
# if > 1, will enable HPA for frontend
frontend_max_replicas: 1

# scale up if avg cpu utilization exceeds
frontend_avg_cpu_threshold: 80

# scale up if avg memory utilization exceeds
frontend_avg_memory_threshold: 95


# MongoDB Image
# =========================================
mongo_local: true

mongo_host: "local-mongo.default"

mongo_image: "docker.io/library/mongo:6.0.5"
mongo_pull_policy: "IfNotPresent"

mongo_cpu: "12m"

mongo_memory: "512Mi"


mongo_auth:
  # specify either username + password (for local mongo)
  username: root
  password: PASSWORD!

  # or full URL (for remote mongo server)
  # db_url: mongodb+srv://...

# Redis Image
# =========================================
redis_local: true

redis_image: "redis"
redis_pull_policy: "IfNotPresent"

redis_url: "redis://local-redis.default:6379/1"

redis_cpu: "10m"

redis_memory: "200Mi"

redis_storage: "3Gi"

# Crawler Channels
# =========================================
# Support for additional crawler release channels
# If more than one channel is provided, a dropdown will be shown to users
# 'default' channel must always be included
crawler_channels:
  - id: default
    image: "docker.io/webrecorder/browsertrix-crawler:latest"

  # Add, remove, or edit additional crawler versions below, for example:
  # - id: custom_version
  #   image: "<DOCKER IMAGE>"

crawler_pull_policy: "Always"

crawler_namespace: "crawlers"

# if set, will restrict QA to crawler image names that are >= this value
# min_qa_crawler_image: ""

# if set, will restrict the autoclick behavior to crawler image names that are >= this value
min_autoclick_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.5.0"
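# For example, with the default above a crawl using
# "docker.io/webrecorder/browsertrix-crawler:1.4.2" would have autoclick
# removed from its behaviors, while ":1.5.0" and later - as well as
# non-numeric tags such as ":latest", which compare as newer - keep it.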

# optional: enable to use a persistent volume claim for all crawls
# can be enabled to use a multi-write shared filesystem
# crawler_pv_claim: "nfs-shared-crawls"

# num retries
crawl_retries: 1000

# Crawler Resources
# -----------------

# base cpu for 1 browser
crawler_cpu_base: 900m

# base memory for 1 browser
crawler_memory_base: 1024Mi

# number of browser workers per crawler instance
crawler_browser_instances: 2

# number of browser workers per crawler instance for QA runs
# defaults to 'crawler_browser_instances' if not set
# qa_browser_instances: 2

# fixed scale (number of crawler pods) for QA runs
qa_scale: 1

# this value is added to crawler_cpu_base for each additional browser
# crawler_cpu = crawler_cpu_base + crawler_extra_cpu_per_browser * (crawler_browser_instances - 1)
crawler_extra_cpu_per_browser: 600m

crawler_extra_memory_per_browser: 768Mi

# if not set, defaults to the following, but can be overridden directly:
# crawler_cpu = crawler_cpu_base + crawler_extra_cpu_per_browser * (crawler_browser_instances - 1)
# crawler_cpu:

# if not set, defaults to the following, but can be overridden directly:
# crawler_memory = crawler_memory_base + crawler_extra_memory_per_browser * (crawler_browser_instances - 1)
# crawler_memory:
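# Worked example with the defaults above (for illustration only):
#   crawler_cpu    = 900m   + 600m  * (2 - 1) = 1500m
#   crawler_memory = 1024Mi + 768Mi * (2 - 1) = 1792Mi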

# Crawler Autoscaling
# ---------------------

# if set to true, automatically adjust crawler memory usage up to max_crawler_memory
enable_auto_resize_crawlers: false


# max crawler memory, if set, will enable auto-resizing of crawler pods up to this size
# if not set, no auto-resizing is done, and crawls always use 'crawler_memory' memory
# max_crawler_memory:

# optional: defaults to crawler_memory_base and crawler_cpu_base if not set
# profile_browser_memory:
#
# profile_browser_cpu:

# optional: set the workdir size for the profilebrowser pods
# the workdir is used to store the browser profile data and other temporary files
# profile_browser_workdir_size: 4Gi

# Other Crawler Settings
# ----------------------

# minimum storage allocated to each crawler
# should be at least double the crawl session size to ensure space for WACZ and browser profile data
crawler_storage: "25Gi"


# if set, will ensure 'crawler_storage' is at least this many times the used storage
# eg. if a crawl session reaches 10GB and this value is 2.5, will attempt
# to resize to at least 25GB
crawler_min_avail_storage_ratio: 2.5

# max size at which crawler will commit current crawl session
crawler_session_size_limit_bytes: "10000000000"

# max time in seconds after which crawler will restart, if set
crawler_session_time_limit_seconds: 18000

crawler_liveness_port: 6065

# optional: use this proxy by default, when no other proxy is set for the crawl
# must match one of the proxy ids in the 'btrix-proxies.proxies' list
# will set the proxy to shared
# default_proxy: "proxy-id"

# optional: enable the proxies subchart and configure a list of ssh servers to be used as crawler proxies
btrix-proxies:
  enabled: false  # enable to deploy proxies configmap and secret
  crawler_namespace: "crawlers"
  proxies: []
  # - id: proxy-id  # name of the proxy shown in the dropdown; must be lowercase and alphanumeric, may contain dashes
  #   url:  # proxy connection string, must be an ssh://, socks:// or http:// URL
  #   label: "US Proxy"  # label to show in dropdown
  #   country_code: US  # Alpha-2 ISO 3166 country code, https://www.iso.org/obp/ui/#search
  #   description: "Proxy"  # optional: description to show for the proxy
  #   shared: false  # optional: set to true to make the proxy available to all orgs
  #   ssh_private_key: |  # required for ssh:// proxies
  #     # ssh key needed to connect to the SSH server
  #     <secret key>
  #
  #   ssh_host_public_key: |  # optional, for ssh:// proxies only
  #     # ssh public keys of the SSH server
  #     # use output of `ssh-keyscan $hostname -p $port` for best results
  #     example.invalid:22 SSH-2.0-OpenSSH_9.6p1 Ubuntu-3ubuntu13
  #     example.invalid ssh-rsa AAA[..]

# optional: set the uid, gid and fsgroup for the crawler and profilebrowser pods
# the following values are used by default:
# crawler_uid: 201407
# crawler_gid: 201407
# crawler_fsgroup: 201407


# optional: enable/disable crawler network policy
crawler_enable_network_policy: true

# optional: replace the default crawler egress policy with your own
# see chart/templates/networkpolicies.yaml for an example
# crawler_network_policy_egress: {}

# time to wait for graceful stop
grace_period: 1000

# Local Minio Pod (optional)
# =========================================
# set to true to use a local minio image
minio_local: true

# enable to allow access to minio console via specified port
# minio_local_console_port: 30091

minio_scheme: "http"
minio_host: "local-minio.default:9000"

minio_image: docker.io/minio/minio:RELEASE.2022-10-24T18-35-07Z
minio_mc_image: minio/mc
minio_pull_policy: "IfNotPresent"

minio_local_bucket_name: &local_bucket_name "btrix-data"

minio_cpu: "10m"
minio_memory: "1024Mi"

# Storage
# =========================================
# should include the local minio bucket, if enabled, and any other available buckets for default storage

storages:
  - name: "default"
    type: "s3"
    access_key: "ADMIN"
    secret_key: "PASSW0RD"
    bucket_name: *local_bucket_name

    endpoint_url: "http://local-minio.default:9000/"
    access_endpoint_url: "/data/"
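  # Illustrative example of an additional bucket entry, reusing the same
  # fields as above (hypothetical name, credentials, and endpoint):
  # - name: "example-bucket"
  #   type: "s3"
  #   access_key: "EXAMPLE_ACCESS_KEY"
  #   secret_key: "EXAMPLE_SECRET_KEY"
  #   bucket_name: "btrix-example"
  #   endpoint_url: "https://s3.example.com/"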

# optional: duration in minutes for WACZ download links to be valid
# used by webhooks and replay
# max value = 10079 (one week minus one minute)
# storage_presign_duration_minutes: 10079

# Email Options
# =========================================
email:
  # email sending is enabled when 'smtp_host' is set to non-empty value
  # ex: smtp_host: smtp.gmail.com
  smtp_host: ""
  smtp_port: 587
  sender_email: example@example.com
  password: password
  reply_to_email: example@example.com
  use_tls: True

  # if True, will print contents of all emails sent to stdout log
  log_sent_emails: False

  # use to provide an additional support email in email templates
  support_email: ""

# Deployment options
# =========================================

# Ingress (Optional)
# Optional: if 'host' is set, a publicly accessible Ingress controller is created with an SSL cert (using letsencrypt)
ingress:
  #host: ""
  cert_email: "test@example.com"
  tls: false

  # If set, will use the old 'kubernetes.io/ingress.class' annotation instead of the new ingressClassName
  # also uses old http01.ingress.class in cert-manager instead of http01.ingress.ingressClassName
  # provided for backwards compatibility
  useOldClassAnnotation: false

# Optional: Uncomment to use your own cluster-issuer instead of default ACME https validation
# custom_cluster_issuer: custom_cluster_issuer-name

ingress_class: nginx

# Optional: Front-end injected script
# This runs as a blocking script on the frontend, so usually you'll want to have it just add a single
# script tag to the page with the `defer` attribute. Useful for things like analytics and bug tracking.
# inject_extra: // your front-end injected script
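# Illustrative sketch only (hypothetical analytics URL; verify how inject_extra
# is consumed in your deployment before relying on the exact scalar style):
# inject_extra: >-
#   var s = document.createElement("script");
#   s.src = "https://analytics.example.com/script.js";
#   s.defer = true;
#   document.head.appendChild(s);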

# Signing Options
# =========================================
# optionally enable signer
signer:
  enabled: false
  image: webrecorder/authsign:0.5.2
  # host: <set to signer domain>
  # cert_email: "test@example.com"
  # image_pull_policy: "IfNotPresent"
  # auth_token: <set to custom value>

signer_cpu: "5m"

signer_memory: "50Mi"


# Optional: configure load balancing annotations
# service:
#   annotations:
#     service.beta.kubernetes.io/aws-load-balancer-internal: "true"
#     helm.sh/resource-policy: keep

# Admin services (see Chart.yaml's dependencies)
# note: see `chart/examples/local-logging.yaml`
addons:
  admin:
    logging: false

# metacontroller: