Cleanup crawler args (#889)

* crawler args cleanup:
- move the crawler command-line args entirely to the configmap
- add required settings like --generateWACZ and --waitOnDone to the configmap so they cannot be overridden
- values files can configure individual settings, which are assembled in the configmap (see the sketch after this list)
- move disk_utilization_threshold to the configmap
- add 'crawler_logging_opts' and 'crawler_extract_full_text' options to values.yaml to make these settings easier to configure
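
A minimal sketch of a deployment-specific values override under the new scheme (the option names come from values.yaml in this diff; the file name and the values shown are illustrative, not defaults):

# my-values.yaml (hypothetical override file)
crawler_logging_opts: "stats,behaviors"
crawler_extract_full_text: true
disk_utilization_threshold: "80"
crawler_browser_instances: 4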

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Tessa Walsh 2023-05-30 19:29:07 -04:00 committed by GitHub
parent 08b3d706a7
commit 0284903b34
2 changed files with 8 additions and 4 deletions

chart/templates/configmap.yaml

@@ -71,7 +71,7 @@ metadata:
namespace: {{ .Values.crawler_namespace }}
data:
CRAWL_ARGS: "{{ .Values.crawler_args }} --workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }}"
CRAWL_ARGS: "--workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --waitOnDone --collection thecrawl --screencastPort 9037"
---
apiVersion: v1
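
For reference, a sketch of what the new entry renders to with the defaults from values.yaml below (the size limit, time limit, and liveness port are deployment-specific placeholders, not values from this diff; max_pages_per_crawl is assumed unset):

# rendered ConfigMap entry (sketch)
CRAWL_ARGS: "--workers 2 --sizeLimit <bytes> --timeLimit <seconds> --maxPageLimit 0 --healthCheckPort <port> --diskUtilization 90 --logging stats,behaviors,debug --text false --generateWACZ --waitOnDone --collection thecrawl --screencastPort 9037"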

chart/values.yaml

@@ -24,8 +24,15 @@ default_page_load_time_seconds: 120
# disk utilization threshold percentage - when used disk space passes
# this percentage of total, crawls will gracefully stop to prevent the
# disk from being filled
+# This should be a string so that it can be included in crawler_args
disk_utilization_threshold: 90
+# crawler logging flags
+crawler_logging_opts: "stats,behaviors,debug"
+# to enable, set to a value other than 'false'
+crawler_extract_full_text: false
# max pages per crawl
# set to non-zero value to enforce global max pages per crawl limit
# if set, each workflow can have a lower limit, but not higher
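
Since disk_utilization_threshold is interpolated into CRAWL_ARGS as text, quoting the value keeps the YAML type unambiguous; a sketch of an override honoring the comment above:

# quoted so the value stays a string when assembled into CRAWL_ARGS
disk_utilization_threshold: "90"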
@@ -162,9 +169,6 @@ crawler_namespace: "crawlers"
# num retries
crawl_retries: 1000
-# browsertrix-crawler args:
-crawler_args: "--logging stats,behaviors,debug --generateWACZ --text --collection thecrawl --screencastPort 9037 --logErrorsToRedis --diskUtilization {{ .Values.disk_utilization_threshold | default 90 }} --waitOnDone"
crawler_browser_instances: 2
crawler_requests_cpu: "800m"
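
For deployments that previously customized crawler_args in a values override, a sketch of the equivalent under the new scheme (the old flag values shown are illustrative):

# before: a single opaque string (crawler_args is now removed)
#   crawler_args: "--logging stats --diskUtilization 85 ..."
# after: set the individual options instead
crawler_logging_opts: "stats"
disk_utilization_threshold: "85"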