Cleanup crawler args (#889)
* crawler args cleanup:
  - move the crawler args command line entirely to the configmap
  - add required settings like --generateWACZ and --waitOnDone to the configmap so they cannot be overridden
  - values files can configure individual settings, which are assembled in the configmap
  - move disk_utilization_threshold to the configmap
  - add 'crawler_logging_opts' and 'crawler_extract_full_text' options to values.yaml to make these options easier to set

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
commit 0284903b34 (parent 08b3d706a7)
@@ -71,7 +71,7 @@ metadata:
   namespace: {{ .Values.crawler_namespace }}

 data:
-  CRAWL_ARGS: "{{ .Values.crawler_args }} --workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }}"
+  CRAWL_ARGS: "--workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --waitOnDone --collection thecrawl --screencastPort 9037"

 ---
 apiVersion: v1
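With the values shown in this change (2 browser instances, disk utilization threshold 90, logging opts "stats,behaviors,debug", full-text extraction disabled), the rendered configmap entry looks roughly like the sketch below; the size limit, time limit, and health check port values are illustrative placeholders, since their defaults are not part of this diff:

    data:
      CRAWL_ARGS: "--workers 2 --sizeLimit 100000000000 --timeLimit 18000 --maxPageLimit 0 --healthCheckPort 6065 --diskUtilization 90 --logging stats,behaviors,debug --text false --generateWACZ --waitOnDone --collection thecrawl --screencastPort 9037"

Note that --generateWACZ, --waitOnDone, --collection, and --screencastPort are hardcoded in the template, so no values file can remove or change them.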
@@ -24,8 +24,15 @@ default_page_load_time_seconds: 120
 # disk utilization threshold percentage - when used disk space passes
 # this percentage of total, crawls will gracefully stop to prevent the
 # disk from being filled
+# This should be a string so that it can be included in crawler_args
 disk_utilization_threshold: 90

+# crawler logging flags
+crawler_logging_opts: "stats,behaviors,debug"
+
+# to enable, set to a value other than 'false'
+crawler_extract_full_text: false
+
 # max pages per crawl
 # set to non-zero value to enforce global max pages per crawl limit
 # if set, each workflow can have a lower limit, but not higher
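With these keys in values.yaml, a deployment can tune individual settings from an override file instead of rewriting the whole command line. A minimal sketch, assuming a hypothetical override file name:

    # local-overrides.yaml (hypothetical file name)
    crawler_logging_opts: "stats,debug"
    crawler_extract_full_text: true
    disk_utilization_threshold: 85

Applied with something like helm upgrade <release> <chart-path> -f local-overrides.yaml (release name and chart path assumed); the fixed flags in the configmap are appended regardless of what the override sets.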
@@ -162,9 +169,6 @@ crawler_namespace: "crawlers"
 # num retries
 crawl_retries: 1000

-# browsertrix-crawler args:
-crawler_args: "--logging stats,behaviors,debug --generateWACZ --text --collection thecrawl --screencastPort 9037 --logErrorsToRedis --diskUtilization {{ .Values.disk_utilization_threshold | default 90 }} --waitOnDone"
-
 crawler_browser_instances: 2

 crawler_requests_cpu: "800m"
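Since the configmap no longer references {{ .Values.crawler_args }}, an override file that previously replaced the whole command line needs to migrate to the individual settings. A sketch, assuming the old override only adjusted logging and text extraction:

    # before (no longer read by the configmap):
    # crawler_args: "--logging stats --text --generateWACZ --collection thecrawl ..."
    # after: set the individual values; required flags stay fixed in the configmap
    crawler_logging_opts: "stats"
    crawler_extract_full_text: true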