config: add overridable 'user_agent_suffix' and 'user_agent' to values.yaml (#910)
These values are passed to the crawler's --userAgentSuffix and --userAgent params, respectively, using 'quote' to support spaces in the user agent. config: re-order settings to put the 'Crawler Settings' section first, followed by 'Cluster Settings'. Fixes #787.
This commit is contained in:
parent
a718043fa8
commit
dd757961fc
@ -71,7 +71,8 @@ metadata:
|
|||||||
namespace: {{ .Values.crawler_namespace }}
|
namespace: {{ .Values.crawler_namespace }}
|
||||||
|
|
||||||
data:
|
data:
|
||||||
CRAWL_ARGS: "--workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --waitOnDone --collection thecrawl --screencastPort 9037 --logErrorsToRedis"
|
CRAWL_ARGS: >-
|
||||||
|
--workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --userAgentSuffix {{ .Values.user_agent_suffix | quote }} --userAgent {{ .Values.user_agent | quote }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --waitOnDone --collection thecrawl --screencastPort 9037 --logErrorsToRedis
|
||||||
|
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
|
@ -1,19 +1,6 @@
|
|||||||
# Settings
|
|
||||||
|
# Crawler Settings
|
||||||
# =========================================
|
# =========================================
|
||||||
name: browsertrix-cloud
|
|
||||||
|
|
||||||
# when running in the cloud, set this value to cloud-specific block storage
|
|
||||||
# keep empty to use hostPath (eg. on minikube)
|
|
||||||
volume_storage_class:
|
|
||||||
|
|
||||||
# if set, set the node selector 'nodeType' for deployment pods
|
|
||||||
# main_node_type:
|
|
||||||
|
|
||||||
# if set, set the node selector 'nodeType' for crawling pods
|
|
||||||
# crawler_node_type:
|
|
||||||
|
|
||||||
registration_enabled: "0"
|
|
||||||
jwt_token_lifetime_minutes: 1440
|
|
||||||
|
|
||||||
# default time to run behaviors on each page (in seconds)
|
# default time to run behaviors on each page (in seconds)
|
||||||
default_behavior_time_seconds: 300
|
default_behavior_time_seconds: 300
|
||||||
@ -38,14 +25,13 @@ crawler_extract_full_text: false
|
|||||||
# if set, each workflow can have a lower limit, but not higher
|
# if set, each workflow can have a lower limit, but not higher
|
||||||
max_pages_per_crawl: 0
|
max_pages_per_crawl: 0
|
||||||
|
|
||||||
# if set to "1", allow inviting same user to same org multiple times
|
# User Agent Options
|
||||||
allow_dupe_invites: "0"
|
# set to add suffix to default browser User Agent
|
||||||
|
# user_agent_suffix:
|
||||||
|
|
||||||
# number of seconds before pending invites expire - default is 7 days
|
# set to override User Agent completely (also overrides user_agent_suffix if both are set)
|
||||||
invite_expire_seconds: 604800
|
# user_agent:
|
||||||
|
|
||||||
# base url for replayweb.page
|
|
||||||
rwp_base_url: "https://replayweb.page/"
|
|
||||||
|
|
||||||
# default template for generating WACZ files
|
# default template for generating WACZ files
|
||||||
# supports following interpolated vars:
|
# supports following interpolated vars:
|
||||||
@ -55,6 +41,33 @@ rwp_base_url: "https://replayweb.page/"
|
|||||||
# @id - full crawl id
|
# @id - full crawl id
|
||||||
default_crawl_filename_template: "@ts-@hostsuffix.wacz"
|
default_crawl_filename_template: "@ts-@hostsuffix.wacz"
|
||||||
|
|
||||||
|
|
||||||
|
# Cluster Settings
|
||||||
|
# =========================================
|
||||||
|
name: browsertrix-cloud
|
||||||
|
|
||||||
|
# when running in the cloud, set this value to cloud-specific block storage
|
||||||
|
# keep empty to use hostPath (eg. on minikube)
|
||||||
|
volume_storage_class:
|
||||||
|
|
||||||
|
# if set, set the node selector 'nodeType' for deployment pods
|
||||||
|
# main_node_type:
|
||||||
|
|
||||||
|
# if set, set the node selector 'nodeType' for crawling pods
|
||||||
|
# crawler_node_type:
|
||||||
|
|
||||||
|
registration_enabled: "0"
|
||||||
|
jwt_token_lifetime_minutes: 1440
|
||||||
|
|
||||||
|
# if set to "1", allow inviting same user to same org multiple times
|
||||||
|
allow_dupe_invites: "0"
|
||||||
|
|
||||||
|
# number of seconds before pending invites expire - default is 7 days
|
||||||
|
invite_expire_seconds: 604800
|
||||||
|
|
||||||
|
# base url for replayweb.page
|
||||||
|
rwp_base_url: "https://replayweb.page/"
|
||||||
|
|
||||||
superuser:
|
superuser:
|
||||||
# set this to enable a superuser admin
|
# set this to enable a superuser admin
|
||||||
email: admin@example.com
|
email: admin@example.com
|
||||||
|
Loading…
Reference in New Issue
Block a user