334 lines
		
	
	
		
			8.4 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
			
		
		
	
	
			334 lines
		
	
	
		
			8.4 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
| 
 | |
| # Crawler Settings
 | |
| # =========================================
 | |
| 
 | |
| # default time to run behaviors on each page (in seconds)
 | |
| default_behavior_time_seconds: 300
 | |
| 
 | |
| # default time to wait for page to fully load before running behaviors (in seconds)
 | |
| default_page_load_time_seconds: 120
 | |
| 
 | |
| # disk utilization threshold percentage - when used disk space passes
 | |
| # this percentage of total, crawls will gracefully stop to prevent the
 | |
| # disk from being filled
 | |
| # This should be a string so that it can be included in crawler_args
 | |
| disk_utilization_threshold: 90
 | |
| 
 | |
| # crawler logging flags
 | |
| crawler_logging_opts: "stats,behaviors,debug"
 | |
| 
 | |
| # to enable, set to a value other than 'false'
 | |
| crawler_extract_full_text: false
 | |
| 
 | |
| # max pages per crawl
 | |
| # set to non-zero value to enforce global max pages per crawl limit
 | |
| # if 0, there is no page limit (may need to adjust crawler/redis settings for larger crawls)
 | |
| # if set, each workflow can have a lower limit, but not higher
 | |
| max_pages_per_crawl: 50000
 | |
| 
 | |
| # User Agent Options
 | |
| # set to add suffix to default browser User Agent
 | |
| # user_agent_suffix:
 | |
| 
 | |
| # set to override User Agent completely (also overrides user_agent_suffix if both are set)
 | |
| # user_agent:
 | |
| 
 | |
| 
 | |
| # default filename template for generated wacz files
 | |
| # supports following interpolated vars:
 | |
| # @ts - current timestamp
 | |
| # @hostname - full hostname
 | |
| # @hostsuffix - last 14 characters of hostname
 | |
| # @id - full crawl id
 | |
| default_crawl_filename_template: "@ts-@hostsuffix.wacz"
 | |
| 
 | |
| 
 | |
| # advanced: additional args to be passed to the crawler
 | |
| # this is mostly for testing of new/experimental crawler flags
 | |
| # standard crawler options are covered with other options above
 | |
| crawler_extra_args: ""
 | |
| 
 | |
| 
 | |
| # max allowed crawl scale per crawl
 | |
| max_crawl_scale: 3
 | |
| 
 | |
| 
 | |
| # Cluster Settings
 | |
| # =========================================
 | |
| name: browsertrix-cloud
 | |
| 
 | |
| # when running in the cloud, set this value to cloud-specific block storage
 | |
| # keep empty to use hostPath (eg. on minikube)
 | |
| volume_storage_class:
 | |
| 
 | |
| # if set, set the node selector 'nodeType' for deployment pods
 | |
| # main_node_type:
 | |
| 
 | |
| # if set, set the node selector 'nodeType' for crawling pods
 | |
| # crawler_node_type:
 | |
| 
 | |
| registration_enabled: "0"
 | |
| jwt_token_lifetime_minutes: 1440
 | |
| 
 | |
| # if set to "1", allow inviting same user to same org multiple times
 | |
| allow_dupe_invites: "0"
 | |
| 
 | |
| # number of seconds before pending invites expire - default is 7 days
 | |
| invite_expire_seconds: 604800
 | |
| 
 | |
| # base url for replayweb.page
 | |
| rwp_base_url: "https://cdn.jsdelivr.net/npm/replaywebpage@1.8.12/"
 | |
| 
 | |
| superuser:
 | |
|   # set this to enable a superuser admin
 | |
|   email: admin@example.com
 | |
| 
 | |
|   # optional: if not set, automatically generated
 | |
|   # change or remove this
 | |
|   password: PASSW0RD!
 | |
| 
 | |
| # Set name for default organization created with superuser
 | |
| default_org: "My Organization"
 | |
| 
 | |
| 
 | |
| # API Image
 | |
| # =========================================
 | |
| backend_image: "docker.io/webrecorder/browsertrix-backend:1.8.0-beta.0"
 | |
| backend_pull_policy: "Always"
 | |
| 
 | |
| backend_password_secret: "PASSWORD!"
 | |
| 
 | |
| # number of backend pods
 | |
| backend_num_replicas: 1
 | |
| 
 | |
| # number of workers per pod
 | |
| backend_workers: 2
 | |
| 
 | |
| backend_cpu: "25m"
 | |
| 
 | |
| backend_memory: "350Mi"
 | |
| 
 | |
| # port for operator service
 | |
| opPort: 8756
 | |
| 
 | |
| job_cpu: "3m"
 | |
| job_memory: "70Mi"
 | |
| 
 | |
| profile_browser_idle_seconds: 60
 | |
| 
 | |
| # if set, print the last 'log_failed_crawl_lines' lines of each failed
 | |
| # crawl pod to backend operator stdout
 | |
| # mostly intended for debugging / testing
 | |
| # log_failed_crawl_lines: 200
 | |
| 
 | |
| 
 | |
| # Nginx Image
 | |
| # =========================================
 | |
| frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.8.0-beta.0"
 | |
| frontend_pull_policy: "Always"
 | |
| 
 | |
| frontend_cpu: "10m"
 | |
| 
 | |
| frontend_memory: "64Mi"
 | |
| 
 | |
| # if set, maps nginx to a fixed port on host machine
 | |
| # must be between 30000 - 32767
 | |
| # use for deployments on localhost when not using ingress
 | |
| # if using ingress, this value is ignored
 | |
| local_service_port: 30870
 | |
| 
 | |
| 
 | |
| # MongoDB Image
 | |
| # =========================================
 | |
| mongo_local: true
 | |
| 
 | |
| mongo_host: "local-mongo.default"
 | |
| 
 | |
| mongo_image: "docker.io/library/mongo:6.0.5"
 | |
| mongo_pull_policy: "IfNotPresent"
 | |
| 
 | |
| mongo_cpu: "12m"
 | |
| 
 | |
| mongo_memory: "512Mi"
 | |
| 
 | |
| 
 | |
| mongo_auth:
 | |
|   # specify either username + password (for local mongo)
 | |
|   username: root
 | |
|   password: PASSWORD!
 | |
| 
 | |
|   # or full URL (for remote mongo server)
 | |
|   # db_url: mongodb+srv://...
 | |
| 
 | |
| 
 | |
| # Redis Image
 | |
| # =========================================
 | |
| redis_local: true
 | |
| 
 | |
| redis_image: "redis"
 | |
| redis_pull_policy: "IfNotPresent"
 | |
| 
 | |
| redis_url: "redis://local-redis.default:6379/1"
 | |
| 
 | |
| redis_cpu: "10m"
 | |
| 
 | |
| redis_memory: "200Mi"
 | |
| 
 | |
| redis_storage: "3Gi"
 | |
| 
 | |
| 
 | |
| # Crawler Image
 | |
| # =========================================
 | |
| 
 | |
| crawler_image: "webrecorder/browsertrix-crawler:latest"
 | |
| crawler_pull_policy: "Always"
 | |
| 
 | |
| crawler_namespace: "crawlers"
 | |
| 
 | |
| # optional: enable to use a persistent volume claim for all crawls
 | |
| # can be enabled to use a multi-write shared filesystem
 | |
| # crawler_pv_claim: "nfs-shared-crawls"
 | |
| 
 | |
| # num retries
 | |
| crawl_retries: 1000
 | |
| 
 | |
| # Crawler Resources
 | |
| # -----------------
 | |
| 
 | |
| # base cpu for 1 browser
 | |
| crawler_cpu_base: 900m
 | |
| 
 | |
| # base memory for 1 browser
 | |
| crawler_memory_base: 1024Mi
 | |
| 
 | |
| # number of browsers per crawler instance
 | |
| crawler_browser_instances: 2
 | |
| 
 | |
| # this value is added to crawler_cpu_base, for each additional browser
 | |
| # crawler_cpu = crawler_cpu_base + crawler_extra_cpu_per_browser * (crawler_browser_instances - 1)
 | |
| crawler_extra_cpu_per_browser: 600m
 | |
| 
 | |
| crawler_extra_memory_per_browser: 768Mi
 | |
| 
 | |
| # if not set, defaults to the following, but can be overridden directly:
 | |
| # crawler_cpu = crawler_cpu_base + crawler_extra_cpu_per_browser * (crawler_browser_instances - 1)
 | |
| # crawler_cpu:
 | |
| 
 | |
| # if not set, defaults to the following, but can be overridden directly:
 | |
| # crawler_memory = crawler_memory_base + crawler_extra_memory_per_browser * (crawler_browser_instances - 1)
 | |
| # crawler_memory:
 | |
| 
 | |
| # Other Crawler Settings
 | |
| # ----------------------
 | |
| 
 | |
| # minimum size allocated to each crawler
 | |
| # should be at least double crawl session size to ensure space for WACZ
 | |
| crawler_storage: "22Gi"
 | |
| 
 | |
| # max size at which crawler will commit current crawl session
 | |
| crawler_session_size_limit_bytes: "10000000000"
 | |
| 
 | |
| # max time in seconds after which crawler will restart, if set
 | |
| crawler_session_time_limit_seconds: 18000
 | |
| 
 | |
| crawler_liveness_port: 6065
 | |
| 
 | |
| # optional: use socks5 proxy for crawler and profilebrowser
 | |
| # crawler_socks_proxy_host: 192.0.2.1
 | |
| # crawler_socks_proxy_port: 9050
 | |
| 
 | |
| # time to wait for graceful stop
 | |
| grace_period: 1000
 | |
| 
 | |
| 
 | |
| # Local Minio Pod (optional)
 | |
| # =========================================
 | |
| # set to true to use a local minio image
 | |
| minio_local: true
 | |
| 
 | |
| minio_scheme: "http"
 | |
| minio_host: "local-minio.default:9000"
 | |
| 
 | |
| minio_image: docker.io/minio/minio:RELEASE.2022-10-24T18-35-07Z
 | |
| minio_mc_image: minio/mc
 | |
| minio_pull_policy: "IfNotPresent"
 | |
| 
 | |
| minio_local_bucket_name: &local_bucket_name "btrix-data"
 | |
| 
 | |
| minio_cpu: "10m"
 | |
| minio_memory: "1024Mi"
 | |
| 
 | |
| 
 | |
| # Storage
 | |
| # =========================================
 | |
| # should include the local minio bucket, if enabled, and any other available buckets for default storage
 | |
| 
 | |
| storages:
 | |
|   - name: "default"
 | |
|     access_key: "ADMIN"
 | |
|     secret_key: "PASSW0RD"
 | |
|     bucket_name: *local_bucket_name
 | |
| 
 | |
|     endpoint_url: "http://local-minio.default:9000/"
 | |
| 
 | |
| # optional: if above includes a separate storage for profiles, specify here to store profiles separately from wacz files
 | |
| # may be useful if, for example, the wacz files are public, while profiles should not be
 | |
| # shared_storage_profile:
 | |
| 
 | |
| 
 | |
| # Email Options
 | |
| # =========================================
 | |
| email:
 | |
|   # email sending is enabled when 'smtp_host' is set to a non-empty value
 | |
|   # ex: smtp_host: smtp.gmail.com
 | |
|   smtp_host: ""
 | |
|   smtp_port: 587
 | |
|   sender_email: example@example.com
 | |
|   password: password
 | |
|   reply_to_email: example@example.com
 | |
|   use_tls: true
 | |
| 
 | |
| 
 | |
| # Deployment options
 | |
| # =========================================
 | |
| 
 | |
| # Ingress (Optional)
 | |
| # Optional: if 'host' is set, a publicly accessible Ingress controller is created with an SSL cert (using letsencrypt)
 | |
| ingress:
 | |
|   #host: ""
 | |
|   cert_email: "test@example.com"
 | |
|   tls: false
 | |
| 
 | |
| ingress_class: nginx
 | |
| 
 | |
| 
 | |
| # Signing Options
 | |
| # =========================================
 | |
| # optionally enable signer
 | |
| signer:
 | |
|   enabled: false
 | |
|   image: webrecorder/authsign:0.5.0
 | |
|   # host: <set to signer domain>
 | |
|   # cert_email: "test@example.com"
 | |
|   # image_pull_policy: "IfNotPresent"
 | |
|   # auth_token: <set to custom value>
 | |
| 
 | |
| signer_cpu: "5m"
 | |
| 
 | |
| signer_memory: "40Mi"
 | |
| 
 | |
| 
 | |
| # Optional: configure load balancing annotations
 | |
| # service:
 | |
| #   annotations:
 | |
| #     service.beta.kubernetes.io/aws-load-balancer-internal: "true"
 | |
| #     helm.sh/resource-policy: keep
 | |
| 
 | |
| # Admin services (see Chart.yaml's dependencies)
 | |
| # note: see `chart/examples/local-logging.yaml`
 | |
| addons:
 | |
|   admin:
 | |
|     logging: false
 | |
| 
 | |
| # metacontroller:
 |