396 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
			
		
		
	
	
			396 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
 | 
						|
# Crawler Settings
 | 
						|
# =========================================
 | 
						|
 | 
						|
# default time to run behaviors on each page (in seconds)
 | 
						|
default_behavior_time_seconds: 300
 | 
						|
 | 
						|
# default time to wait for page to fully load before running behaviors (in seconds)
 | 
						|
default_page_load_time_seconds: 120
 | 
						|
 | 
						|
# disk utilization threshold percentage - when used disk space passes
 | 
						|
# this percentage of total, crawls will gracefully stop to prevent the
 | 
						|
# disk from being filled
 | 
						|
# This should be a string so that it can be included in crawler_args
 | 
						|
disk_utilization_threshold: "90"  # percentage; quoted so it is parsed as a string
 | 
						|
 | 
						|
# crawler logging flags
 | 
						|
crawler_logging_opts: "stats,behaviors,debug"
 | 
						|
 | 
						|
# to enable, set to one or more comma-separated values: to-warc,to-pages,final-to-warc
 | 
						|
crawler_extract_full_text: to-warc
 | 
						|
 | 
						|
# max pages per crawl
 | 
						|
# set to non-zero value to enforce global max pages per crawl limit
 | 
						|
# if 0, there is no page limit (may need to adjust crawler/redis settings for larger crawls)
 | 
						|
# if set, each workflow can have a lower limit, but not higher
 | 
						|
max_pages_per_crawl: 50000
 | 
						|
 | 
						|
 | 
						|
# default template for generated WACZ filenames
 | 
						|
# supports following interpolated vars:
 | 
						|
# @ts - current timestamp
 | 
						|
# @hostname - full hostname
 | 
						|
# @hostsuffix - last 14 characters of hostname
 | 
						|
# @id - full crawl id
 | 
						|
default_crawl_filename_template: "@ts-@hostsuffix.wacz"
 | 
						|
 | 
						|
 | 
						|
# advanced: additional args to be passed to the crawler
 | 
						|
# this is mostly for testing of new/experimental crawler flags
 | 
						|
# standard crawler options are covered with other options above
 | 
						|
crawler_extra_args: ""
 | 
						|
 | 
						|
 | 
						|
# max allowed crawl scale per crawl
 | 
						|
max_crawl_scale: 3
 | 
						|
 | 
						|
 | 
						|
# Cluster Settings
 | 
						|
# =========================================
 | 
						|
name: browsertrix-cloud
 | 
						|
 | 
						|
# when running in the cloud, set this value to cloud-specific block storage
 | 
						|
# keep empty to use hostPath (eg. on minikube)
 | 
						|
volume_storage_class:
 | 
						|
 | 
						|
# if set, set the node selector 'nodeType' for deployment pods
 | 
						|
# main_node_type:
 | 
						|
 | 
						|
# if set, set the node selector 'nodeType' for crawler pods
 | 
						|
# crawler_node_type:
 | 
						|
 | 
						|
registration_enabled: "0"
 | 
						|
 | 
						|
# if set, along with 'registration_enabled', will add registered users to this org
 | 
						|
# registration_org_id: ""
 | 
						|
 | 
						|
jwt_token_lifetime_minutes: 1440
 | 
						|
 | 
						|
# if set to "1", allow inviting same user to same org multiple times
 | 
						|
allow_dupe_invites: "0"
 | 
						|
 | 
						|
# number of seconds before pending invites expire - default is 7 days
 | 
						|
invite_expire_seconds: 604800
 | 
						|
 | 
						|
# base url for replayweb.page
 | 
						|
rwp_base_url: "https://cdn.jsdelivr.net/npm/replaywebpage@1.8.15/"
 | 
						|
 | 
						|
superuser:
 | 
						|
  # set this to enable a superuser admin
 | 
						|
  email: admin@example.com
 | 
						|
 | 
						|
  # optional: if not set, automatically generated
 | 
						|
  # change or remove this
 | 
						|
  password: PASSW0RD!
 | 
						|
 | 
						|
# Set name for default organization created with superuser
 | 
						|
default_org: "My Organization"
 | 
						|
 | 
						|
 | 
						|
# API Image
 | 
						|
# =========================================
 | 
						|
backend_image: "docker.io/webrecorder/browsertrix-backend:1.10.0-beta.6"
 | 
						|
backend_pull_policy: "Always"
 | 
						|
 | 
						|
backend_password_secret: "PASSWORD!"
 | 
						|
 | 
						|
# number of workers per pod
 | 
						|
backend_workers: 1
 | 
						|
 | 
						|
backend_cpu: "100m"
 | 
						|
 | 
						|
backend_memory: "350Mi"
 | 
						|
 | 
						|
# port for operator service
 | 
						|
opPort: 8756
 | 
						|
 | 
						|
job_cpu: "3m"
 | 
						|
job_memory: "70Mi"
 | 
						|
 | 
						|
profile_browser_idle_seconds: 60
 | 
						|
 | 
						|
# if set, print last 'log_failed_crawl_lines' of each failed
 | 
						|
# crawl pod to backend operator stdout
 | 
						|
# mostly intended for debugging / testing
 | 
						|
# log_failed_crawl_lines: 200
 | 
						|
 | 
						|
# Autoscale
 | 
						|
# ---------
 | 
						|
# max number of backend pods to scale to
 | 
						|
# if > 1, will enable HPA for backend
 | 
						|
backend_max_replicas: 1
 | 
						|
 | 
						|
# scale up if avg cpu utilization exceeds
 | 
						|
backend_avg_cpu_threshold: 80
 | 
						|
 | 
						|
# scale up if avg memory utilization exceeds
 | 
						|
backend_avg_memory_threshold: 95
 | 
						|
 | 
						|
# Nginx Image
 | 
						|
# =========================================
 | 
						|
frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.10.0-beta.6"
 | 
						|
frontend_pull_policy: "Always"
 | 
						|
 | 
						|
frontend_cpu: "10m"
 | 
						|
 | 
						|
frontend_memory: "64Mi"
 | 
						|
 | 
						|
# if set, maps nginx to a fixed port on host machine
 | 
						|
# must be between 30000 - 32767
 | 
						|
# use for deployments on localhost when not using ingress
 | 
						|
# if using ingress, this value is ignored
 | 
						|
local_service_port: 30870
 | 
						|
 | 
						|
frontend_alias: "http://browsertrix-cloud-frontend"
 | 
						|
 | 
						|
# Autoscaling
 | 
						|
# -----------
 | 
						|
# max number of frontend pods to scale to
 | 
						|
# if > 1, will enable HPA for frontend
 | 
						|
frontend_max_replicas: 1
 | 
						|
 | 
						|
# scale up if avg cpu utilization exceeds
 | 
						|
frontend_avg_cpu_threshold: 80
 | 
						|
 | 
						|
# scale up if avg memory utilization exceeds
 | 
						|
frontend_avg_memory_threshold: 95
 | 
						|
 | 
						|
 | 
						|
# MongoDB Image
 | 
						|
# =========================================
 | 
						|
mongo_local: true
 | 
						|
 | 
						|
mongo_host: "local-mongo.default"
 | 
						|
 | 
						|
mongo_image: "docker.io/library/mongo:6.0.5"
 | 
						|
mongo_pull_policy: "IfNotPresent"
 | 
						|
 | 
						|
mongo_cpu: "12m"
 | 
						|
 | 
						|
mongo_memory: "512Mi"
 | 
						|
 | 
						|
 | 
						|
mongo_auth:
 | 
						|
  # specify either username + password (for local mongo)
 | 
						|
  username: root
 | 
						|
  password: PASSWORD!
 | 
						|
 | 
						|
  # or full URL (for remote mongo server)
 | 
						|
  # db_url: mongodb+srv://...
 | 
						|
 | 
						|
 | 
						|
# Redis Image
 | 
						|
# =========================================
 | 
						|
redis_local: true
 | 
						|
 | 
						|
redis_image: "redis"
 | 
						|
redis_pull_policy: "IfNotPresent"
 | 
						|
 | 
						|
redis_url: "redis://local-redis.default:6379/1"
 | 
						|
 | 
						|
redis_cpu: "10m"
 | 
						|
 | 
						|
redis_memory: "200Mi"
 | 
						|
 | 
						|
redis_storage: "3Gi"
 | 
						|
 | 
						|
 | 
						|
# Crawler Channels
 | 
						|
# =========================================
 | 
						|
# Support for additional crawler release channels
 | 
						|
# If more than one channel is provided, a dropdown will be shown to users
 | 
						|
# 'default' channel must always be included
 | 
						|
crawler_channels:
 | 
						|
  - id: default
 | 
						|
    image: "docker.io/webrecorder/browsertrix-crawler:latest"
 | 
						|
 | 
						|
  # Add, remove, or edit additional crawler versions below, for example:
 | 
						|
  # - id: custom_version
 | 
						|
  #   image: "<DOCKER IMAGE>"
 | 
						|
 | 
						|
crawler_pull_policy: "Always"
 | 
						|
 | 
						|
crawler_namespace: "crawlers"
 | 
						|
 | 
						|
# if set, will restrict QA to image names that are >= this value
 | 
						|
# min_qa_crawler_image: ""
 | 
						|
 | 
						|
# optional: enable to use a persistent volume claim for all crawls
 | 
						|
# can be enabled to use a multi-write shared filesystem
 | 
						|
# crawler_pv_claim: "nfs-shared-crawls"
 | 
						|
 | 
						|
# num retries
 | 
						|
crawl_retries: 1000
 | 
						|
 | 
						|
# Crawler Resources
 | 
						|
# -----------------
 | 
						|
 | 
						|
# base cpu for 1 browser
 | 
						|
crawler_cpu_base: 900m
 | 
						|
 | 
						|
# base memory for 1 browser
 | 
						|
crawler_memory_base: 1024Mi
 | 
						|
 | 
						|
# number of browsers per crawler instance
 | 
						|
crawler_browser_instances: 2
 | 
						|
 | 
						|
# this value is added to crawler_cpu_base, for each additional browser
 | 
						|
# crawler_cpu = crawler_cpu_base + crawler_extra_cpu_per_browser * (crawler_browser_instances - 1)
 | 
						|
crawler_extra_cpu_per_browser: 600m
 | 
						|
 | 
						|
crawler_extra_memory_per_browser: 768Mi
 | 
						|
 | 
						|
# if not set, defaults to the following, but can be overridden directly:
 | 
						|
# crawler_cpu = crawler_cpu_base + crawler_extra_cpu_per_browser * (crawler_browser_instances - 1)
 | 
						|
# crawler_cpu:
 | 
						|
 | 
						|
# if not set, defaults to the following, but can be overridden directly:
 | 
						|
# crawler_memory = crawler_memory_base + crawler_extra_memory_per_browser * (crawler_browser_instances - 1)
 | 
						|
# crawler_memory:
 | 
						|
 | 
						|
 | 
						|
# max crawler memory, if set, will enable auto-resizing of crawler pods up to this size
 | 
						|
# if not set, no auto-resizing is done, and crawls always use 'crawler_memory' memory
 | 
						|
# max_crawler_memory:
 | 
						|
 | 
						|
# optional: defaults to crawler_memory_base and crawler_cpu_base if not set
 | 
						|
# profile_browser_memory:
 | 
						|
#
 | 
						|
# profile_browser_cpu:
 | 
						|
 | 
						|
# optional: set the workdir size for the profilebrowser pods
 | 
						|
# the workdir is used to store the browser profile data and other temporary files
 | 
						|
# profile_browser_workdir_size: 4Gi
 | 
						|
 | 
						|
# Other Crawler Settings
 | 
						|
# ----------------------
 | 
						|
 | 
						|
# minimum size allocated to each crawler
 | 
						|
# should be at least double crawl session size to ensure space for WACZ and browser profile data
 | 
						|
crawler_storage: "26Gi"
 | 
						|
 | 
						|
# max size at which crawler will commit current crawl session
 | 
						|
crawler_session_size_limit_bytes: "10000000000"
 | 
						|
 | 
						|
# max time in seconds after which crawler will restart, if set
 | 
						|
crawler_session_time_limit_seconds: 18000
 | 
						|
 | 
						|
crawler_liveness_port: 6065
 | 
						|
 | 
						|
# optional: use socks5 proxy for crawler and profilebrowser
 | 
						|
# crawler_socks_proxy_host: 192.0.2.1
 | 
						|
# crawler_socks_proxy_port: 9050
 | 
						|
 | 
						|
# optional: set the uid, gid and fsgroup for the crawler and profilebrowser pods
 | 
						|
# crawler_uid: 201400007
 | 
						|
# crawler_gid: 201400007
 | 
						|
# crawler_fsgroup: 201400007
 | 
						|
 | 
						|
 | 
						|
 | 
						|
# time to wait for graceful stop
 | 
						|
grace_period: 1000
 | 
						|
 | 
						|
 | 
						|
# Local Minio Pod (optional)
 | 
						|
# =========================================
 | 
						|
# set to true to use a local minio image
 | 
						|
minio_local: true
 | 
						|
 | 
						|
# enable to allow access to minio console via specified port
 | 
						|
# minio_local_console_port: 30091
 | 
						|
 | 
						|
minio_scheme: "http"
 | 
						|
minio_host: "local-minio.default:9000"
 | 
						|
 | 
						|
minio_image: docker.io/minio/minio:RELEASE.2022-10-24T18-35-07Z
 | 
						|
minio_mc_image: minio/mc
 | 
						|
minio_pull_policy: "IfNotPresent"
 | 
						|
 | 
						|
minio_local_bucket_name: &local_bucket_name "btrix-data"
 | 
						|
 | 
						|
minio_cpu: "10m"
 | 
						|
minio_memory: "1024Mi"
 | 
						|
 | 
						|
 | 
						|
# Storage
 | 
						|
# =========================================
 | 
						|
# should include the local minio bucket, if enabled, and any other available buckets for default storage
 | 
						|
 | 
						|
storages:
 | 
						|
  - name: "default"
 | 
						|
    type: "s3"
 | 
						|
    access_key: "ADMIN"
 | 
						|
    secret_key: "PASSW0RD"
 | 
						|
    bucket_name: *local_bucket_name
 | 
						|
 | 
						|
    endpoint_url: "http://local-minio.default:9000/"
 | 
						|
 | 
						|
 | 
						|
# optional: duration in minutes for WACZ download links to be valid
 | 
						|
# used by webhooks and replay
 | 
						|
# max value = 10079 (one week minus one minute)
 | 
						|
# storage_presign_duration_minutes: 10079
 | 
						|
 | 
						|
 | 
						|
# Email Options
 | 
						|
# =========================================
 | 
						|
email:
 | 
						|
  # email sending is enabled when 'smtp_host' is set to non-empty value
 | 
						|
  # ex: smtp_host: smtp.gmail.com
 | 
						|
  smtp_host: ""
 | 
						|
  smtp_port: 587
 | 
						|
  sender_email: example@example.com
 | 
						|
  password: password
 | 
						|
  reply_to_email: example@example.com
 | 
						|
  use_tls: true  # canonical lowercase boolean, matching the other booleans in this file
 | 
						|
  support_email: support@example.com
 | 
						|
 | 
						|
 | 
						|
# Deployment options
 | 
						|
# =========================================
 | 
						|
 | 
						|
# Ingress (Optional)
 | 
						|
# Optional: if 'host' is set, a publicly accessible Ingress controller is created with an SSL cert (using letsencrypt)
 | 
						|
ingress:
 | 
						|
  #host: ""
 | 
						|
  cert_email: "test@example.com"
 | 
						|
  tls: false
 | 
						|
  # Optional: Uncomment to use your own cluster-issuer instead of default ACME https validation
 | 
						|
  # custom_cluster_issuer: custom_cluster_issuer-name
 | 
						|
 | 
						|
ingress_class: nginx
 | 
						|
 | 
						|
 | 
						|
 | 
						|
# Signing Options
 | 
						|
# =========================================
 | 
						|
# optionally enable signer
 | 
						|
signer:
 | 
						|
  enabled: false
 | 
						|
  image: webrecorder/authsign:0.5.0
 | 
						|
  # host: <set to signer domain>
 | 
						|
  # cert_email: "test@example.com"
 | 
						|
  # image_pull_policy: "IfNotPresent"
 | 
						|
  # auth_token: <set to custom value>
 | 
						|
 | 
						|
signer_cpu: "5m"
 | 
						|
 | 
						|
signer_memory: "40Mi"
 | 
						|
 | 
						|
 | 
						|
# Optional: configure load balancing annotations
 | 
						|
# service:
 | 
						|
#   annotations:
 | 
						|
#     service.beta.kubernetes.io/aws-load-balancer-internal: "true"
 | 
						|
#     helm.sh/resource-policy: keep
 | 
						|
 | 
						|
# Admin services (see Chart.yaml's dependencies)
 | 
						|
# note: see `chart/examples/local-logging.yaml`
 | 
						|
addons:
 | 
						|
  admin:
 | 
						|
    logging: false
 | 
						|
 | 
						|
# metacontroller:
 |