# Global Settings
# =========================================

# locales available to choose from in the UI
# if not set, all locales available by default
# ex: enable only 'en' and 'es' locales
# locales_enabled: "en,es"


# Crawler Settings
# =========================================

# default time to run behaviors on each page (in seconds)
default_behavior_time_seconds: 300

# default time to wait for page to fully load before running behaviors (in seconds)
default_page_load_time_seconds: 120

# disk utilization threshold percentage - when used disk space passes
# this percentage of total, crawls will gracefully stop to prevent the
# disk from being filled
# This should be a string so that it can be included in crawler_args
disk_utilization_threshold: 90

# crawler logging flags
crawler_logging_opts: "stats,behaviors,debug"

# to enable, set to one or more comma-separated values: to-warc,to-pages,final-to-warc
crawler_extract_full_text: to-warc

# max pages per crawl
# set to non-zero value to enforce global max pages per crawl limit
# if 0, there is no page limit (may need to adjust crawler/redis settings for larger crawls)
# if set, each workflow can have a lower limit, but not higher
max_pages_per_crawl: 50000


# default template for generated WACZ files
# supports the following interpolated vars:
# @ts - current timestamp
# @hostname - full hostname
# @hostsuffix - last 14 characters of hostname
# @id - full crawl id
default_crawl_filename_template: "@ts-@hostsuffix.wacz"
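# ex (illustrative only, assuming a numeric timestamp like 20240115093000):
# for a hostname ending in 'abc123def45678', the default template above
# would produce: 20240115093000-abc123def45678.wacz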


# advanced: additional args to be passed to the crawler
# this is mostly for testing of new/experimental crawler flags
# standard crawler options are covered with other options above
crawler_extra_args: ""
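# ex: a hypothetical illustration only - any flags passed here must be valid
# for the crawler image in use; check the browsertrix-crawler docs first
# crawler_extra_args: "--blockAds"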


# max allowed browser windows per crawl
max_browser_windows: 8
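# note (illustrative assumption, not a setting): if browser windows are spread
# across pods at 'crawler_browser_instances' windows per pod, 8 windows with
# 2 browsers per pod would mean up to 4 crawler pods per crawl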


# Cluster Settings
# =========================================
name: browsertrix-cloud

# when running in the cloud, set this value to cloud-specific block storage
# keep empty to use hostPath (e.g. on minikube)
volume_storage_class:

# if set, sets the node selector 'nodeType' for deployment pods
# main_node_type:

# if set, sets the node selector 'nodeType' for crawling pods
# crawler_node_type:

# if set to "1", enables open registration
registration_enabled: "0"

# if set, along with 'registration_enabled', will add registered users to this org
# registration_org_id: ""

jwt_token_lifetime_minutes: 1440

# if set to "1", allow inviting same user to same org multiple times
allow_dupe_invites: "0"

# number of seconds before pending invites expire - default is 7 days
invite_expire_seconds: 604800

# number of minutes before paused crawls are stopped - default is 7 days
paused_crawl_limit_minutes: 10080

# base url for replayweb.page
rwp_base_url: "https://cdn.jsdelivr.net/npm/replaywebpage/"

superuser:
  # set this to enable a superuser admin
  email: admin@example.com

  # optional: if not set, automatically generated
  # change or remove this
  password: PASSW0RD!

# Set name for default organization created with superuser
default_org: "My Organization"

# Set number of days replica file deletion should be delayed by
# if set >0, will keep replicas (if any) for this number of days
replica_deletion_delay_days: 0


# API Image
# =========================================
backend_image: "docker.io/webrecorder/browsertrix-backend:1.17.4"
backend_pull_policy: "IfNotPresent"

backend_password_secret: "PASSWORD!"

# number of workers per pod
backend_workers: 1

# for gunicorn --timeout
backend_worker_timeout: 60

backend_cpu: "100m"

backend_memory: "350Mi"

# port for operator service
opPort: 8756

job_cpu: "3m"
job_memory: "70Mi"

profile_browser_idle_seconds: 60

# set to true to enable subscriptions API and Billing tab
billing_enabled: false

# set URL to external sign-up page
# the internal sign-up page will take precedence if
# `registration_enabled` is set to `"1"`
sign_up_url: ""

# set e-mail to show for subscriptions related info
sales_email: ""


# survey e-mail
# if set, subscription cancellation e-mails will include a link to this survey
user_survey_url: ""

# if set, print last 'log_failed_crawl_lines' of each failed
# crawl pod to backend operator stdout
# mostly intended for debugging / testing
# log_failed_crawl_lines: 200

# Autoscale
# ---------
# max number of backend pods to scale to
# if > 1, will enable HPA for backend
backend_max_replicas: 1

# scale up if avg cpu utilization exceeds
backend_avg_cpu_threshold: 80

# scale up if avg memory utilization exceeds
backend_avg_memory_threshold: 95

# Nginx Image
# =========================================
frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.17.4"
frontend_pull_policy: "IfNotPresent"

frontend_cpu: "10m"

frontend_memory: "64Mi"

# if set, maps nginx to a fixed port on host machine
# must be between 30000 - 32767
# use for deployments on localhost when not using ingress
# if using ingress, this value is ignored
local_service_port: 30870

frontend_alias: "http://browsertrix-cloud-frontend"

# custom URL for where Browsertrix docs are hosted
# by default, docs are served from /docs/ but can be served from a custom
# URL specified here.
# docs_url: "https://browsertrix-docs.example.com/"

# Autoscaling
# -----------
# max number of frontend pods to scale to
# if > 1, will enable HPA for frontend
frontend_max_replicas: 1

# scale up if avg cpu utilization exceeds
frontend_avg_cpu_threshold: 80

# scale up if avg memory utilization exceeds
frontend_avg_memory_threshold: 95


# MongoDB Image
# =========================================
mongo_local: true

mongo_host: "local-mongo"

mongo_image: "docker.io/library/mongo:6.0.5"
mongo_pull_policy: "IfNotPresent"

mongo_cpu: "12m"

mongo_memory: "512Mi"


mongo_auth:
  # specify either username + password (for local mongo)
  username: root
  password: PASSWORD!

  # or full URL (for remote mongo server)
  # db_url: mongodb+srv://...


# Redis Image
# =========================================
redis_image: "redis"
redis_pull_policy: "IfNotPresent"

redis_cpu: "10m"

redis_memory: "200Mi"

redis_storage: "3Gi"


# Crawler Channels
# =========================================
# Support for additional crawler release channels
# If more than one channel is provided, a dropdown will be shown to users
# 'default' channel must always be included
crawler_channels:
  - id: default
    image: "docker.io/webrecorder/browsertrix-crawler:latest"
    imagePullPolicy: Always

  # Add, remove, or edit additional crawler versions below, for example:
  # - id: custom_version
  #   image: "<DOCKER IMAGE>"
  #   imagePullPolicy: Always|IfNotPresent|Never (optional, defaults to crawler_pull_policy)

# default crawler pull policy if not set per channel
crawler_pull_policy: "IfNotPresent"

crawler_namespace: "crawlers"

# if set, will restrict QA to image names that are >= this value
# min_qa_crawler_image: ""

# if set, will restrict autoclick behavior to image names that are >= this value
min_autoclick_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.5.0"

# optional: enable to use a persistent volume claim for all crawls
# can be enabled to use a multi-write shared filesystem
# crawler_pv_claim: "nfs-shared-crawls"

# num retries
crawl_retries: 1000

# Crawler Resources
# -----------------

# base cpu for 1 browser
crawler_cpu_base: 900m

# base memory for 1 browser
crawler_memory_base: 1024Mi

# number of browser workers per crawler instance
crawler_browser_instances: 2

# number of browser workers per QA pod to run for QA runs
# defaults to 'crawler_browser_instances' if not set
qa_browser_instances: 1

# number of browser windows to run for QA (with 'qa_browser_instances' per pod)
qa_num_browser_windows: 2
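# ex: with the two values above, 2 QA browser windows at 1 browser per pod
# implies 2 QA pods (an illustrative reading of these settings)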

# this value is added to crawler_cpu_base, for each additional browser
# crawler_cpu = crawler_cpu_base + crawler_extra_cpu_per_browser * (crawler_browser_instances - 1)
crawler_extra_cpu_per_browser: 600m

crawler_extra_memory_per_browser: 768Mi

# if not set, defaults to the following, but can be overridden directly:
# crawler_cpu = crawler_cpu_base + crawler_extra_cpu_per_browser * (crawler_browser_instances - 1)
# crawler_cpu:

# if not set, defaults to the following, but can be overridden directly:
# crawler_memory = crawler_memory_base + crawler_extra_memory_per_browser * (crawler_browser_instances - 1)
# crawler_memory:
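# worked example, using the default values above:
# crawler_cpu    = 900m + 600m * (2 - 1) = 1500m
# crawler_memory = 1024Mi + 768Mi * (2 - 1) = 1792Mi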

# Crawler Autoscaling
# -------------------

# if set to true, automatically adjust crawler memory usage up to max_crawler_memory
enable_auto_resize_crawlers: false


# max crawler memory, if set, will enable auto-resizing of crawler pods up to this size
# if not set, no auto-resizing is done, and crawls always use 'crawler_memory' memory
# max_crawler_memory:

# optional: defaults to crawler_memory_base and crawler_cpu_base if not set
# profile_browser_memory:
#
# profile_browser_cpu:

# optional: set the workdir size for the profilebrowser pods
# the workdir is used to store the browser profile data and other temporary files
# profile_browser_workdir_size: 4Gi


# Other Crawler Settings
# ----------------------

# minimum size allocated to each crawler
# should be at least double crawl session size to ensure space for WACZ and browser profile data
crawler_storage: "25Gi"


# if set, will ensure 'crawler_storage' is at least this times used storage
# e.g. if crawler session reaches 10GB, and this value is 2.5, will attempt
# to resize to at least 25GB
crawler_min_avail_storage_ratio: 2.5

# max size at which crawler will commit current crawl session
crawler_session_size_limit_bytes: "10000000000"

# max time in seconds after which crawler will restart, if set
crawler_session_time_limit_seconds: 18000

crawler_liveness_port: 6065

# optional: use this proxy by default, when no other proxy is set for the crawl
# must match one of the proxy ids in the 'btrix-proxies.proxies' list
# will set the proxy to shared
# default_proxy: "proxy-id"

# optional: enable the proxies subchart and configure a list of ssh servers to be used as crawler proxies
btrix-proxies:
  enabled: false # enable to deploy proxies configmap and secret
  crawler_namespace: "crawlers"
  proxies: []
  # - id: proxy-id # name of the proxy shown in the dropdown; must be lowercase alphanumeric, may contain dashes
  #   url: # proxy connection string, must be an ssh://, socks:// or http:// URL
  #   label: "US Proxy" # label to show in dropdown
  #   country_code: US # Alpha-2 ISO 3166 country code, https://www.iso.org/obp/ui/#search
  #   description: "Proxy" # optional: description to show for the proxy
  #   shared: false # optional: set to true to make proxy available for all orgs
  #   ssh_private_key: | # required for ssh:// proxies
  #     # ssh-key needed to connect to the SSH server
  #     <secret key>
  #
  #   ssh_host_public_key: | # optional, for ssh:// proxies only
  #     # ssh public keys of the SSH server
  #     # use output of `ssh-keyscan $hostname -p $port` for best results
  #     example.invalid:22 SSH-2.0-OpenSSH_9.6p1 Ubuntu-3ubuntu13
  #     example.invalid ssh-rsa AAA[..]

# optional: set the uid, gid and fsgroup for the crawler and profilebrowser pods
# the following values are used by default:
# crawler_uid: 201407
# crawler_gid: 201407
# crawler_fsgroup: 201407


# optional: enable/disable crawler network policy, prevents crawler pods from accessing internal services
crawler_enable_network_policy: true

# optional: add additional egress rules to the default crawler network policy (see chart/templates/networkpolicies.yaml for an example)
# crawler_network_policy_additional_egress: []
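# ex: a hypothetical extra egress rule allowing crawlers to reach an internal
# proxy subnet (standard Kubernetes NetworkPolicy egress syntax; CIDR and port
# are placeholders):
# crawler_network_policy_additional_egress:
#   - to:
#       - ipBlock:
#           cidr: 10.0.100.0/24
#     ports:
#       - port: 8080
#         protocol: TCP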

# optional: replace the default crawler egress policy with your own egress rules
# see chart/templates/networkpolicies.yaml for an example
# crawler_network_policy_egress: []

# time to wait for graceful stop
grace_period: 1000


# Local Minio Pod (optional)
# =========================================
# set to true to use a local minio image
minio_local: true

# enable to allow access to minio console via specified port
# minio_local_console_port: 30091

minio_scheme: "http"
minio_host: "local-minio:9000"

minio_image: docker.io/minio/minio:RELEASE.2022-10-24T18-35-07Z
minio_mc_image: minio/mc
minio_pull_policy: "IfNotPresent"

minio_local_bucket_name: &local_bucket_name "btrix-data"

# path for serving from local minio bucket
minio_access_path: &minio_access_path "/data/"

minio_cpu: "10m"
minio_memory: "1024Mi"


# Storage
# =========================================
# should include the local minio bucket, if enabled, and any other available buckets for default storage

storages:
  - name: "default"
    type: "s3"
    access_key: "ADMIN"
    secret_key: "PASSW0RD"
    bucket_name: *local_bucket_name

    endpoint_url: "http://local-minio:9000/"
    access_endpoint_url: *minio_access_path

    # access_addressing_style: 'path' or 'virtual'
    # determines if bucket should be accessed as:
    # - virtual - https://<bucket>.<host>/<key>
    # - path - https://<host>/<bucket>/<key>
    #
    # if not specified, defaults to 'path' for local minio or
    # 'virtual' for all other storages
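
  # ex: a hypothetical second storage entry for a remote S3-compatible bucket
  # (bucket name and endpoint are placeholders):
  # - name: "remote"
  #   type: "s3"
  #   access_key: "<access key>"
  #   secret_key: "<secret key>"
  #   bucket_name: "my-remote-bucket"
  #   endpoint_url: "https://s3.us-east-1.amazonaws.com/"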

# optional: duration in minutes for WACZ download links to be valid
# used by webhooks and replay
# max value = 10079 (one week minus one minute)
# storage_presign_duration_minutes: 10079


# Email Options
# =========================================
email:
  # email sending is enabled when 'smtp_host' is set to a non-empty value
  # ex: smtp_host: smtp.gmail.com
  smtp_host: ""
  smtp_port: 587
  sender_email: example@example.com
  password: password
  reply_to_email: example@example.com
  use_tls: True

  # if True, will print contents of all emails sent to stdout log
  log_sent_emails: False

  # use to provide an additional support email in email templates
  support_email: ""


# Deployment options
# =========================================

# Ingress (Optional)
# Optional: if 'host' is set, a publicly accessible Ingress controller is created with an SSL cert (using letsencrypt)
ingress:
  # host: ""
  cert_email: "test@example.com"
  tls: false

  # If set, will use the old 'kubernetes.io/ingress.class' annotation instead of the new ingressClassName
  # also uses old http01.ingress.class in cert-manager instead of http01.ingress.ingressClassName
  # provided for backwards compatibility
  useOldClassAnnotation: false

# Optional: Uncomment to use your own cluster-issuer instead of default ACME https validation
# custom_cluster_issuer: custom_cluster_issuer-name

ingress_class: nginx
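
# ex: a hypothetical public deployment with TLS via letsencrypt
# (host and email are placeholders):
# ingress:
#   host: "btrix.example.com"
#   cert_email: "admin@example.com"
#   tls: true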

# Optional: Front-end injected script
# This runs as a blocking script on the frontend, so usually you'll want it to
# just add a single script tag to the page with the `defer` attribute.
# Useful for things like analytics and bug tracking.
# inject_extra: // your front-end injected script
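# ex: a hypothetical snippet that adds a deferred analytics script tag
# (the URL is a placeholder):
# inject_extra: >-
#   var s = document.createElement('script');
#   s.src = 'https://analytics.example.com/tracker.js';
#   s.defer = true;
#   document.head.appendChild(s);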

# Signing Options
# =========================================
# optionally enable signer
signer:
  enabled: false
  image: webrecorder/authsign:0.5.2
  # host: <set to signer domain>
  # cert_email: "test@example.com"
  # image_pull_policy: "IfNotPresent"
  # auth_token: <set to custom value>

signer_cpu: "5m"

signer_memory: "50Mi"


# Migration Options (Advanced)
# =========================================

# enable to force rerun from specific migration
# see backend/btrixcloud/migrations/ for list of available migrations
# rerun_from_migration:

# scale for certain migration background jobs
# migration_jobs_scale: 1

# Other Settings
# =========================================

# default FQDN suffix, shouldn't need to change
fqdn_suffix: .svc.cluster.local
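# ex: with the default suffix, a service 'local-mongo' in the 'default'
# namespace resolves to local-mongo.default.svc.cluster.local
# (standard Kubernetes cluster DNS naming)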

# Optional: configure load balancing annotations
# service:
#   annotations:
#     service.beta.kubernetes.io/aws-load-balancer-internal: "true"
#     helm.sh/resource-policy: keep

# Admin services (see Chart.yaml's dependencies)
# note: see `chart/examples/local-logging.yaml`
addons:
  admin:
    logging: false

# metacontroller: