From 413fd8d7ead9d1ce788ed6af26a0a446eab96769 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 24 Feb 2023 17:24:04 -0800 Subject: [PATCH] Chart: split Crawl args into separate variables (#639) * chart crawl args cleanup: - move configurable settings out of 'crawler_args' - add 'crawler_session_size_limit_bytes' and 'crawler_session_time_limit_seconds' for the crawler's --sizeLimit and --timeLimit options - remove hard-coded 'timeout' to allow configuring via crawl config - set liveness check port from existing config value - add comments that 'crawler_requests_storage' must be at least double the session size limit - defaults: set 'crawler_requests_storage' to 22Gi, default crawl session size limit to 10GB --- chart/templates/configmap.yaml | 2 +- chart/values.yaml | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 0f80695c..b0320e49 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -67,7 +67,7 @@ metadata: namespace: {{ .Values.crawler_namespace }} data: - CRAWL_ARGS: "{{ .Values.crawler_args }} --workers {{ .Values.crawler_browser_instances | default 1 }}" + CRAWL_ARGS: "{{ .Values.crawler_args }} --workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --healthCheckPort {{ .Values.crawler_liveness_port }}" --- apiVersion: v1 diff --git a/chart/values.yaml b/chart/values.yaml index ab34e5f3..5fa3c0b0 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -138,17 +138,25 @@ crawler_namespace: "crawlers" crawl_retries: 1000 # browsertrix-crawler args: -crawler_args: "--timeout 120 --logging stats,behaviors,debug --generateWACZ --text --collection thecrawl --screencastPort 9037 --sizeLimit 100000000000 --timeLimit 18000 --healthCheckPort 6065 --waitOnDone" +crawler_args: "--logging stats,behaviors,debug --generateWACZ --text 
--collection thecrawl --screencastPort 9037 --waitOnDone" -crawler_browser_instances: 4 +crawler_browser_instances: 2 crawler_requests_cpu: "800m" crawler_limits_cpu: "1200m" crawler_requests_memory: "512Mi" -crawler_limits_memory: "768Mi" +crawler_limits_memory: "1024Mi" -crawler_requests_storage: "220Gi" +# minimum size allocated to each crawler +# should be at least double crawl session size to ensure space for WACZ +crawler_requests_storage: "22Gi" + +# max size at which crawler will commit current crawl session +crawler_session_size_limit_bytes: "10000000000" + +# max time in seconds after which crawler will restart, if set +crawler_session_time_limit_seconds: 18000 crawler_liveness_port: 6065