From 0284903b3418f071f823eab872feff5af62077f3 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 30 May 2023 19:29:07 -0400 Subject: [PATCH] Cleanup crawler args (#889) * crawler args cleanup: - move crawler args command line entirely to configmap - add required settings like --generateWACZ and --waitOnDone to configmap to not be overridable - values files can configure individual settings, assembled in configmap - move disk_utilization_threshold to configmap - add 'crawler_logging_opts' and 'crawler_extract_full_text' options to values.yaml to more easily set these options --------- Co-authored-by: Ilya Kreymer --- chart/templates/configmap.yaml | 2 +- chart/values.yaml | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 744528bf..a8f88908 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -71,7 +71,7 @@ metadata: namespace: {{ .Values.crawler_namespace }} data: - CRAWL_ARGS: "{{ .Values.crawler_args }} --workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }}" + CRAWL_ARGS: "--workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --waitOnDone --collection thecrawl --screencastPort 9037" --- apiVersion: v1 diff --git a/chart/values.yaml b/chart/values.yaml index d45f8eae..b400dc7a 100644 --- 
a/chart/values.yaml +++ b/chart/values.yaml @@ -24,8 +24,15 @@ default_page_load_time_seconds: 120 # disk utilization threshold percentage - when used disk space passes # this percentage of total, crawls will gracefully stop to prevent the # disk from being filled +# This should be a string so that it can be included in crawler_args disk_utilization_threshold: 90 +# crawler logging flags +crawler_logging_opts: "stats,behaviors,debug" + +# to enable, set to a value other than 'false' +crawler_extract_full_text: false + # max pages per crawl # set to non-zero value to enforce global max pages per crawl limit # if set, each workflow can have a lower limit, but not higher @@ -162,9 +169,6 @@ crawler_namespace: "crawlers" # num retries crawl_retries: 1000 -# browsertrix-crawler args: -crawler_args: "--logging stats,behaviors,debug --generateWACZ --text --collection thecrawl --screencastPort 9037 --logErrorsToRedis --diskUtilization {{ .Values.disk_utilization_threshold | default 90 }} --waitOnDone" - crawler_browser_instances: 2 crawler_requests_cpu: "800m"