# Global Settings
# =========================================

# locales available to choose from in the UI
# if not set, all locales are available by default
# ex: enable only 'en' and 'es' locales
# locales_enabled: "en,es"


# Crawler Settings
# =========================================

# default time to run behaviors on each page (in seconds)
default_behavior_time_seconds: 300

# default time to wait for a page to fully load before running behaviors (in seconds)
default_page_load_time_seconds: 120

# disk utilization threshold percentage - when used disk space passes
# this percentage of total, crawls will gracefully stop to prevent the
# disk from being filled
# This should be a string so that it can be included in crawler_args
disk_utilization_threshold: 90

# crawler logging flags
crawler_logging_opts: "stats,behaviors,debug"

# to enable, set to one or more comma-separated values: to-warc,to-pages,final-to-warc
crawler_extract_full_text: to-warc

# max pages per crawl
# set to a non-zero value to enforce a global max pages per crawl limit
# if 0, there is no page limit (may need to adjust crawler/redis settings for larger crawls)
# if set, each workflow can have a lower limit, but not a higher one
max_pages_per_crawl: 50000

# default template for generated WACZ files
# supports the following interpolated vars:
# @ts - current timestamp
# @hostname - full hostname
# @hostsuffix - last 14 characters of hostname
# @id - full crawl id
default_crawl_filename_template: "@ts-@hostsuffix.wacz"

# advanced: additional args to be passed to the crawler
# this is mostly for testing of new/experimental crawler flags
# standard crawler options are covered with other options above
crawler_extra_args: ""

# max allowed crawl scale per crawl
max_crawl_scale: 3


# Cluster Settings
# =========================================
name: browsertrix-cloud

# when running in the cloud, set this value to cloud-specific block storage
# keep empty to use hostPath (eg. on minikube)
volume_storage_class:

# if set, set the node selector 'nodeType' for deployment pods
# main_node_type:

# if set, set the node selector 'nodeType' for crawling pods
# crawler_node_type:

# if set to "1", enables open registration
registration_enabled: "0"

# if set, along with 'registration_enabled', will add registered users to this org
# (an illustrative combined example appears at the end of this section)
# registration_org_id: ""

jwt_token_lifetime_minutes: 1440

# if set to "1", allow inviting the same user to the same org multiple times
allow_dupe_invites: "0"

# number of seconds before pending invites expire - default is 7 days
invite_expire_seconds: 604800

# base url for replayweb.page
rwp_base_url: "https://cdn.jsdelivr.net/npm/replaywebpage@2.2.4/"

superuser:
  # set this to enable a superuser admin
  email: admin@example.com

  # optional: if not set, automatically generated
  # change or remove this
  password: PASSW0RD!

# Set name for default organization created with superuser
default_org: "My Organization"

# Set number of days replica file deletion should be delayed by
# if set >0, will keep replicas (if any) for this number of days
replica_deletion_delay_days: 0
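# Illustrative example (not part of the defaults): to open self-registration and
# add newly registered users to an existing org, the two registration settings
# above might be combined as follows. The org id is a placeholder, not a real value.
# registration_enabled: "1"
# registration_org_id: "<your-org-uuid>"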
# API Image
# =========================================
backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.0-beta.7"
backend_pull_policy: "Always"

backend_password_secret: "PASSWORD!"

# number of workers per pod
backend_workers: 1

# for gunicorn --timeout
backend_worker_timeout: 60

backend_cpu: "100m"

backend_memory: "350Mi"

# port for operator service
opPort: 8756

job_cpu: "3m"
job_memory: "70Mi"

profile_browser_idle_seconds: 60

# set to true to enable the subscriptions API and Billing tab
billing_enabled: false

# set URL to an external sign-up page
# the internal sign-up page will take precedence if
# `registration_enabled` is set to "1"
sign_up_url: ""

# set e-mail to show for subscription-related info
sales_email: ""

# survey URL
# if set, subscription cancellation e-mails will include a link to this survey
user_survey_url: ""

# if set, print the last 'log_failed_crawl_lines' lines of each failed
# crawl pod to backend operator stdout
# mostly intended for debugging / testing
# log_failed_crawl_lines: 200

# Autoscale
# ---------

# max number of backend pods to scale to
# if > 1, will enable HPA for backend
backend_max_replicas: 1

# scale up if avg cpu utilization exceeds
backend_avg_cpu_threshold: 80

# scale up if avg memory utilization exceeds
backend_avg_memory_threshold: 95


# Nginx Image
# =========================================
frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.0-beta.7"
frontend_pull_policy: "Always"

frontend_cpu: "10m"

frontend_memory: "64Mi"

# if set, maps nginx to a fixed port on the host machine
# must be between 30000 - 32767
# use for deployments on localhost when not using ingress
# if using ingress, this value is ignored
local_service_port: 30870

frontend_alias: "http://browsertrix-cloud-frontend"

# custom URL for where Browsertrix docs are hosted
# by default, docs are served from /docs/ but can be served from a custom
# URL specified here.
# docs_url: "https://browsertrix-docs.example.com/"

# Autoscaling
# -----------

# max number of frontend pods to scale to
# if > 1, will enable HPA for frontend
frontend_max_replicas: 1

# scale up if avg cpu utilization exceeds
frontend_avg_cpu_threshold: 80

# scale up if avg memory utilization exceeds
frontend_avg_memory_threshold: 95


# MongoDB Image
# =========================================
mongo_local: true

mongo_host: "local-mongo.default"

mongo_image: "docker.io/library/mongo:6.0.5"
mongo_pull_policy: "IfNotPresent"

mongo_cpu: "12m"

mongo_memory: "512Mi"

mongo_auth:
  # specify either username + password (for local mongo)
  username: root
  password: PASSWORD!

  # or a full URL (for a remote mongo server)
  # db_url: mongodb+srv://...
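# Illustrative example (not the defaults): to point the backend at an external
# MongoDB instead of the local pod, the values might look like the following.
# The connection string is a placeholder.
# mongo_local: false
# mongo_auth:
#   db_url: "mongodb+srv://btrix:REPLACE_ME@cluster0.example.mongodb.net"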
# Redis Image
# =========================================
redis_local: true

redis_image: "redis"
redis_pull_policy: "IfNotPresent"

redis_url: "redis://local-redis.default:6379/1"

redis_cpu: "10m"

redis_memory: "200Mi"

redis_storage: "3Gi"


# Crawler Channels
# =========================================
# Support for additional crawler release channels
# If more than one channel is provided, a dropdown will be shown to users
# 'default' channel must always be included
crawler_channels:
  - id: default
    image: "docker.io/webrecorder/browsertrix-crawler:latest"

  # Add, remove, or edit additional crawler versions below, for example:
  # - id: custom_version
  #   image: ""

crawler_pull_policy: "Always"

crawler_namespace: "crawlers"

# if set, will restrict QA to image names that are >= this value
# min_qa_crawler_image: ""

# if set, will restrict autoclick behavior to image names that are >= this value
min_autoclick_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.5.0"

# optional: enable to use a persistent volume claim for all crawls
# can be enabled to use a multi-write shared filesystem
# crawler_pv_claim: "nfs-shared-crawls"

# num retries
crawl_retries: 1000


# Crawler Resources
# -----------------

# base cpu for 1 browser
crawler_cpu_base: 900m

# base memory for 1 browser
crawler_memory_base: 1024Mi

# number of browser workers per crawler instance
crawler_browser_instances: 2

# number of browser workers per crawler instance for QA runs
# defaults to 'crawler_browser_instances' if not set
# qa_browser_instances: 2

# fixed scale (number of crawler pods) for QA runs
qa_scale: 1

# this value is added to crawler_cpu_base for each additional browser
# crawler_cpu = crawler_cpu_base + crawler_extra_cpu_per_browser * (crawler_browser_instances - 1)
# (a worked example appears after the autoscaling options below)
crawler_extra_cpu_per_browser: 600m

crawler_extra_memory_per_browser: 768Mi

# if not set, defaults to the following, but can be overridden directly:
# crawler_cpu = crawler_cpu_base + crawler_extra_cpu_per_browser * (crawler_browser_instances - 1)
# crawler_cpu:

# if not set, defaults to the following, but can be overridden directly:
# crawler_memory = crawler_memory_base + crawler_extra_memory_per_browser * (crawler_browser_instances - 1)
# crawler_memory:


# Crawler Autoscaling
# -------------------

# if set to true, automatically adjust crawler memory usage up to max_crawler_memory
enable_auto_resize_crawlers: false

# max crawler memory, if set, will enable auto-resizing of crawler pods up to this size
# if not set, no auto-resizing is done, and crawls always use 'crawler_memory' memory
# max_crawler_memory:

# optional: defaults to crawler_memory_base and crawler_cpu_base if not set
# profile_browser_memory:
#
# profile_browser_cpu:

# optional: set the workdir size for the profilebrowser pods
# the workdir is used to store the browser profile data and other temporary files
# profile_browser_workdir_size: 4Gi
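# Worked example (illustrative only): with the defaults above
# (crawler_cpu_base: 900m, crawler_extra_cpu_per_browser: 600m,
# crawler_memory_base: 1024Mi, crawler_extra_memory_per_browser: 768Mi,
# crawler_browser_instances: 2), each crawler pod requests:
#   crawler_cpu    = 900m   + 600m  * (2 - 1) = 1500m
#   crawler_memory = 1024Mi + 768Mi * (2 - 1) = 1792Mi
# A sketch of an auto-resize setup, letting pods grow beyond that under memory
# pressure (the 4Gi ceiling is a placeholder, not a recommendation):
# enable_auto_resize_crawlers: true
# max_crawler_memory: "4Gi"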
# Other Crawler Settings
# ----------------------

# minimum size allocated to each crawler
# should be at least double the crawl session size to ensure space for WACZ and browser profile data
crawler_storage: "25Gi"

# if set, will ensure 'crawler_storage' is at least this many times the used storage
# eg. if a crawl session reaches 10GB and this value is 2.5, will attempt
# to resize to at least 25GB.
crawler_min_avail_storage_ratio: 2.5

# max size at which crawler will commit the current crawl session
crawler_session_size_limit_bytes: "10000000000"

# max time in seconds after which crawler will restart, if set
crawler_session_time_limit_seconds: 18000

crawler_liveness_port: 6065

# optional: use this proxy by default, when no other proxy is set for the crawl
# must match one of the proxy ids in the 'btrix-proxies.proxies' list
# will set the proxy to shared
# default_proxy: "proxy-id"

# optional: enable the proxies subchart and configure a list of ssh servers to be used as crawler proxies
btrix-proxies:
  enabled: false  # enable to deploy proxies configmap and secret
  crawler_namespace: "crawlers"
  proxies: []
  # - id: proxy-id  # id of the proxy, shown in the dropdown; must be lowercase alphanumeric and may contain dashes
  #   url:  # proxy connection string, must be a ssh://, socks:// or http:// URL
  #   label: "US Proxy"  # label to show in dropdown
  #   country_code: US  # Alpha-2 ISO 3166 country code, https://www.iso.org/obp/ui/#search
  #   description: "Proxy"  # optional: description to show for the proxy
  #   shared: false  # optional: set to true to make the proxy available to all orgs
  #   ssh_private_key: |  # required for ssh:// proxies
  #     # ssh key needed to connect to the SSH server
  #
  #   ssh_host_public_key: |  # optional, for ssh:// proxies only
  #     # ssh public keys of the SSH server
  #     # use output of `ssh-keyscan $hostname -p $port` for best results
  #     example.invalid:22 SSH-2.0-OpenSSH_9.6p1 Ubuntu-3ubuntu13
  #     example.invalid ssh-rsa AAA[..]

# optional: set the uid, gid and fsgroup for the crawler and profilebrowser pods
# the following values are used by default:
# crawler_uid: 201407
# crawler_gid: 201407
# crawler_fsgroup: 201407

# optional: enable/disable crawler network policy
crawler_enable_network_policy: true

# optional: replace the default crawler egress policy with your own
# see chart/templates/networkpolicies.yaml for an example
# crawler_network_policy_egress: {}

# time to wait for graceful stop
grace_period: 1000


# Local Minio Pod (optional)
# =========================================
# set to true to use a local minio image
minio_local: true

# enable to allow access to the minio console via the specified port
# minio_local_console_port: 30091

minio_scheme: "http"
minio_host: "local-minio.default:9000"

minio_image: docker.io/minio/minio:RELEASE.2022-10-24T18-35-07Z
minio_mc_image: minio/mc
minio_pull_policy: "IfNotPresent"

minio_local_bucket_name: &local_bucket_name "btrix-data"

minio_cpu: "10m"
minio_memory: "1024Mi"


# Storage
# =========================================
# should include the local minio bucket, if enabled, and any other available buckets for default storage
# (an illustrative external-bucket example appears at the end of this section)
storages:
  - name: "default"
    type: "s3"
    access_key: "ADMIN"
    secret_key: "PASSW0RD"
    bucket_name: *local_bucket_name

    endpoint_url: "http://local-minio.default:9000/"
    access_endpoint_url: "/data/"

# optional: duration in minutes for WACZ download links to be valid
# used by webhooks and replay
# max value = 10079 (one week minus one minute)
# storage_presign_duration_minutes: 10079
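# Illustrative example (not the defaults): a 'storages' entry pointing at an
# external S3-compatible bucket instead of the local minio pod. The bucket name,
# endpoint and credentials below are placeholders.
# storages:
#   - name: "default"
#     type: "s3"
#     access_key: "<access-key>"
#     secret_key: "<secret-key>"
#     bucket_name: "my-crawl-data"
#     endpoint_url: "https://s3.us-east-1.amazonaws.com/"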
# Email Options
# =========================================
email:
  # email sending is enabled when 'smtp_host' is set to a non-empty value
  # ex: smtp_host: smtp.gmail.com
  smtp_host: ""
  smtp_port: 587
  sender_email: example@example.com
  password: password
  reply_to_email: example@example.com
  use_tls: True
  # if True, will print contents of all emails sent to the stdout log
  log_sent_emails: False
  # use to provide an additional support email in email templates
  support_email: ""


# Deployment options
# =========================================

# Ingress (Optional)
# Optional: if 'host' is set, a publicly accessible Ingress controller is created with an SSL cert (using letsencrypt)
# (an illustrative public-deployment example appears at the end of this file)
ingress:
  #host: ""
  cert_email: "test@example.com"
  tls: false

  # If set, will use the old 'kubernetes.io/ingress.class' annotation instead of the new ingressClassName
  # also uses the old http01.ingress.class in cert-manager instead of http01.ingress.ingressClassName
  # provided for backwards compatibility
  useOldClassAnnotation: false

# Optional: Uncomment to use your own cluster-issuer instead of default ACME https validation
# custom_cluster_issuer: custom_cluster_issuer-name

ingress_class: nginx

# Optional: Front-end injected script
# This runs as a blocking script on the frontend, so usually you'll want it to
# just add a single script tag to the page with the `defer` attribute.
# Useful for things like analytics and bug tracking.
# inject_extra: // your front-end injected script


# Signing Options
# =========================================
# optionally enable signer
signer:
  enabled: false
  image: webrecorder/authsign:0.5.2
  # host:
  # cert_email: "test@example.com"
  # image_pull_policy: "IfNotPresent"
  # auth_token:

signer_cpu: "5m"
signer_memory: "50Mi"


# Migration Options (Advanced)
# =========================================

# enable to force rerun from a specific migration
# see backend/btrixcloud/migrations/ for the list of available migrations
# rerun_from_migration:

# scale for certain migration background jobs
# migration_jobs_scale: 1


# Other Settings
# =========================================

# Optional: configure load balancing annotations
# service:
#   annotations:
#     service.beta.kubernetes.io/aws-load-balancer-internal: "true"
#     helm.sh/resource-policy: keep

# Admin services (see Chart.yaml's dependencies)
# note: see `chart/examples/local-logging.yaml`
addons:
  admin:
    logging: false

  # metacontroller:
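# Illustrative example (not the defaults): a public deployment behind the nginx
# ingress class with TLS via letsencrypt might override the ingress block as
# shown below. The hostname and e-mail are placeholders.
# ingress:
#   host: "browsertrix.example.com"
#   cert_email: "ops@example.com"
#   tls: true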