Cleanup crawler args (#889)

* crawler args cleanup:
- move the crawler command-line args entirely to the configmap
- add required settings like --generateWACZ and --waitOnDone to the configmap so they cannot be overridden
- values files can configure individual settings, which are assembled in the configmap (see the sketch after this list)
- move disk_utilization_threshold to the configmap
- add 'crawler_logging_opts' and 'crawler_extract_full_text' options to values.yaml to make these settings easier to configure
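
A minimal sketch of a deployment-specific values override under the new scheme (the option names come from values.yaml in this diff; the file name and the values shown are illustrative, not defaults):

# my-values.yaml (hypothetical override file)
crawler_logging_opts: "stats,behaviors"
crawler_extract_full_text: true
disk_utilization_threshold: "80"
crawler_browser_instances: 4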

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Tessa Walsh 2023-05-30 19:29:07 -04:00 committed by GitHub
parent 08b3d706a7
commit 0284903b34
2 changed files with 8 additions and 4 deletions

chart/templates/configmap.yaml

@@ -71,7 +71,7 @@ metadata:
namespace: {{ .Values.crawler_namespace }}
data:
CRAWL_ARGS: "{{ .Values.crawler_args }} --workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }}"
CRAWL_ARGS: "--workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --waitOnDone --collection thecrawl --screencastPort 9037"
---
apiVersion: v1
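
For reference, a sketch of what the new entry renders to with the defaults from values.yaml below (the size limit, time limit, and liveness port are deployment-specific placeholders, not values from this diff; max_pages_per_crawl is assumed unset):

# rendered ConfigMap entry (sketch)
CRAWL_ARGS: "--workers 2 --sizeLimit <bytes> --timeLimit <seconds> --maxPageLimit 0 --healthCheckPort <port> --diskUtilization 90 --logging stats,behaviors,debug --text false --generateWACZ --waitOnDone --collection thecrawl --screencastPort 9037"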

chart/values.yaml

@@ -24,8 +24,15 @@ default_page_load_time_seconds: 120
# disk utilization threshold percentage - when used disk space passes
# this percentage of total, crawls will gracefully stop to prevent the
# disk from being filled
+# This should be a string so that it can be included in crawler_args
disk_utilization_threshold: 90
+# crawler logging flags
+crawler_logging_opts: "stats,behaviors,debug"
+# to enable, set to a value other than 'false'
+crawler_extract_full_text: false
# max pages per crawl
# set to non-zero value to enforce global max pages per crawl limit
# if set, each workflow can have a lower limit, but not higher
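
Since disk_utilization_threshold is interpolated into CRAWL_ARGS as text, quoting the value keeps the YAML type unambiguous; a sketch of an override honoring the comment above:

# quoted so the value stays a string when assembled into CRAWL_ARGS
disk_utilization_threshold: "90"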
@@ -162,9 +169,6 @@ crawler_namespace: "crawlers"
# num retries
crawl_retries: 1000
-# browsertrix-crawler args:
-crawler_args: "--logging stats,behaviors,debug --generateWACZ --text --collection thecrawl --screencastPort 9037 --logErrorsToRedis --diskUtilization {{ .Values.disk_utilization_threshold | default 90 }} --waitOnDone"
crawler_browser_instances: 2
crawler_requests_cpu: "800m"
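
For deployments that previously customized crawler_args in a values override, a sketch of the equivalent under the new scheme (the old flag values shown are illustrative):

# before: a single opaque string (crawler_args is now removed)
#   crawler_args: "--logging stats --diskUtilization 85 ..."
# after: set the individual options instead
crawler_logging_opts: "stats"
disk_utilization_threshold: "85"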