config: add overridable 'user_agent_suffix' and 'user_agent' to values.yaml (#910)

The values are passed to the crawler's --userAgentSuffix and --userAgent params, respectively, using 'quote' to support spaces in the user agent. config: re-order settings to put the 'Crawler Settings' section first, followed by 'Cluster Settings'. Fixes #787
2023-06-07 12:01:12 -07:00 · 2023-06-07 12:01:12 -07:00 · dd757961fc
commit dd757961fc
parent a718043fa8
2 changed files with 36 additions and 22 deletions
--- a/chart/templates/configmap.yaml
+++ b/chart/templates/configmap.yaml
@ -71,7 +71,8 @@ metadata:
  namespace: {{ .Values.crawler_namespace }}

 data:
-  CRAWL_ARGS: "--workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --waitOnDone --collection thecrawl --screencastPort 9037 --logErrorsToRedis"
+  CRAWL_ARGS: >-
+    --workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --userAgentSuffix {{ .Values.user_agent_suffix | quote }} --userAgent {{ .Values.user_agent | quote }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --waitOnDone --collection thecrawl --screencastPort 9037 --logErrorsToRedis

 ---
 apiVersion: v1
--- a/chart/values.yaml
+++ b/chart/values.yaml
@ -1,19 +1,6 @@
-# Settings
+
+# Crawler Settings
 # =========================================
-name: browsertrix-cloud
-
-# when running in the cloud, set this value to cloud-specific block storage
-# keep empty to use hostPath (eg. on minikube)
-volume_storage_class:
-
-# if set, set the node selector 'nodeType' for deployment pods
-# main_node_type:
-
-# if set, set the node selector 'nodeType' to this crawling pods
-# crawler_node_type:
-
-registration_enabled: "0"
-jwt_token_lifetime_minutes: 1440

 # default time to run behaviors on each page (in seconds)
 default_behavior_time_seconds: 300
@ -38,14 +25,13 @@ crawler_extract_full_text: false
 # if set, each workflow can have a lower limit, but not higher
 max_pages_per_crawl: 0

-# if set to "1", allow inviting same user to same org multiple times
-allow_dupe_invites: "0"
+# User Agent Options
+# set to add suffix to default browser User Agent
+# user_agent_suffix:

-# number of seconds before pending invites expire - default is 7 days
-invite_expire_seconds: 604800
+# set to override User Agent completely (also overrides user_agent_suffix if both are set)
+# user_agent:

-# base url for replayweb.page
-rwp_base_url: "https://replayweb.page/"

 # default template for generate wacz files
 # supports following interpolated vars:
@ -55,6 +41,33 @@ rwp_base_url: "https://replayweb.page/"
 # @id - full crawl id
 default_crawl_filename_template: "@ts-@hostsuffix.wacz"

+
+# Cluster Settings
+# =========================================
+name: browsertrix-cloud
+
+# when running in the cloud, set this value to cloud-specific block storage
+# keep empty to use hostPath (eg. on minikube)
+volume_storage_class:
+
+# if set, set the node selector 'nodeType' for deployment pods
+# main_node_type:
+
+# if set, set the node selector 'nodeType' for crawling pods
+# crawler_node_type:
+
+registration_enabled: "0"
+jwt_token_lifetime_minutes: 1440
+
+# if set to "1", allow inviting same user to same org multiple times
+allow_dupe_invites: "0"
+
+# number of seconds before pending invites expire - default is 7 days
+invite_expire_seconds: 604800
+
+# base url for replayweb.page
+rwp_base_url: "https://replayweb.page/"
+
 superuser:
  # set this to enable a superuser admin
  email: admin@example.com