config: add overridable 'user_agent_suffix' and 'user_agent' to values.yaml (#910)

These are passed to the crawler's --userAgentSuffix and --userAgent params, respectively,
using 'quote' to support spaces in the user agent.
config: re-order settings to put 'Crawler Settings' section first, followed by 'Cluster Settings'
fixes #787
This commit is contained in:
Ilya Kreymer 2023-06-07 12:01:12 -07:00 committed by GitHub
parent a718043fa8
commit dd757961fc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 36 additions and 22 deletions

View File

@ -71,7 +71,8 @@ metadata:
namespace: {{ .Values.crawler_namespace }}
data:
CRAWL_ARGS: "--workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --waitOnDone --collection thecrawl --screencastPort 9037 --logErrorsToRedis"
# Command-line arguments for the crawler, stored as one string.
# '>-' folds the lines below into a single space-separated line and strips the
# trailing newline. Everything inside the block scalar is literal content, so
# it must not carry a closing double-quote from the old quoted-string form.
# The user agent values go through 'quote' so a value containing spaces stays
# a single argument.
CRAWL_ARGS: >-
  --workers {{ .Values.crawler_browser_instances | default 1 }}
  --sizeLimit {{ .Values.crawler_session_size_limit_bytes }}
  --timeLimit {{ .Values.crawler_session_time_limit_seconds }}
  --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }}
  --healthCheckPort {{ .Values.crawler_liveness_port }}
  --diskUtilization {{ .Values.disk_utilization_threshold }}
  --userAgentSuffix {{ .Values.user_agent_suffix | quote }}
  --userAgent {{ .Values.user_agent | quote }}
  --logging {{ .Values.crawler_logging_opts }}
  --text {{ .Values.crawler_extract_full_text }}
  --generateWACZ
  --waitOnDone
  --collection thecrawl
  --screencastPort 9037
  --logErrorsToRedis
---
apiVersion: v1

View File

@ -1,19 +1,6 @@
# Settings
# Crawler Settings
# =========================================
name: browsertrix-cloud
# when running in the cloud, set this value to cloud-specific block storage
# keep empty to use hostPath (eg. on minikube)
volume_storage_class:
# if set, set the node selector 'nodeType' for deployment pods
# main_node_type:
# if set, sets the node selector 'nodeType' for the crawler pods
# crawler_node_type:
registration_enabled: "0"
jwt_token_lifetime_minutes: 1440
# default time to run behaviors on each page (in seconds)
default_behavior_time_seconds: 300
@ -38,14 +25,13 @@ crawler_extract_full_text: false
# if set, each workflow can have a lower limit, but not higher
max_pages_per_crawl: 0
# if set to "1", allow inviting same user to same org multiple times
allow_dupe_invites: "0"
# User Agent Options
# set to add suffix to default browser User Agent
# user_agent_suffix:
# number of seconds before pending invites expire - default is 7 days
invite_expire_seconds: 604800
# set to override User Agent completely (also overrides user_agent_suffix if both are set)
# user_agent:
# base url for replayweb.page
rwp_base_url: "https://replayweb.page/"
# default template for generated WACZ filenames
# supports the following interpolated vars:
@ -55,6 +41,33 @@ rwp_base_url: "https://replayweb.page/"
# @id - full crawl id
default_crawl_filename_template: "@ts-@hostsuffix.wacz"
# Cluster Settings
# =========================================
name: browsertrix-cloud
# when running in the cloud, set this value to a cloud-specific block storage class
# leave empty to use hostPath (e.g. on minikube)
volume_storage_class:
# if set, sets the node selector 'nodeType' for deployment pods
# main_node_type:
# if set, sets the node selector 'nodeType' for the crawler pods
# crawler_node_type:
# NOTE(review): string flag; presumably "1" enables user registration,
# by analogy with allow_dupe_invites below - confirm
registration_enabled: "0"
# JWT token lifetime, in minutes (1440 minutes = 24 hours)
jwt_token_lifetime_minutes: 1440
# if set to "1", allow inviting the same user to the same org multiple times
allow_dupe_invites: "0"
# number of seconds before pending invites expire - default is 7 days (604800 seconds)
invite_expire_seconds: 604800
# base url for replayweb.page
rwp_base_url: "https://replayweb.page/"
superuser:
# set this to enable a superuser admin
email: admin@example.com