crawler arguments fixes (#621)

- partial fix for #321: don't hard-code the behavior timeout limit into the crawler args
- allow setting the number of crawler browser instances via 'crawler_browser_instances', so the full crawler args no longer have to be overridden just to change the worker count (see the values override sketch below)
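
To illustrate the second point, here is a minimal sketch of a Helm values override an operator might pass (e.g. via `helm upgrade -f`); the file name `local-values.yaml` is hypothetical, and the `--behaviorTimeout 300` flag is shown only to demonstrate that the previously hard-coded limit can still be re-added through `crawler_args` if desired:

```yaml
# local-values.yaml (hypothetical operator override file)

# number of browser workers per crawler; rendered into CRAWL_ARGS as `--workers N`
crawler_browser_instances: 2

# the full arg string can still be overridden when other flags need changing,
# e.g. to re-add a behavior timeout
crawler_args: "--timeout 120 --logging stats,behaviors,debug --generateWACZ --text --collection thecrawl --screencastPort 9037 --sizeLimit 100000000000 --timeLimit 18000 --healthCheckPort 6065 --waitOnDone --behaviorTimeout 300"
```

With the worker count split out into its own value, small tuning changes no longer require copying the entire default arg string.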
Ilya Kreymer 2023-02-22 13:23:19 -08:00 committed by GitHub
parent 974aeb5e93
commit 3df6e0f146
2 changed files with 4 additions and 5 deletions


@@ -67,10 +67,7 @@ metadata:
namespace: {{ .Values.crawler_namespace }}
data:
#CRAWL_ARGS: "{{ .Values.crawler_args }} --redisStoreUrl {{ .Values.redis_url }}"
CRAWL_ARGS: "{{ .Values.crawler_args }}"
#WEBHOOK_URL: "{{ .Values.redis_url }}/crawls-done"
CRAWL_ARGS: "{{ .Values.crawler_args }} --workers {{ .Values.crawler_browser_instances | default 1 }}"
---
apiVersion: v1
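
Assuming standard Helm templating, the updated CRAWL_ARGS line above would render roughly as follows; the output below is an illustrative sketch, not part of this commit:

```yaml
# rendered ConfigMap data with crawler_browser_instances: 4 set in values.yaml
CRAWL_ARGS: "--timeout 120 ... --waitOnDone --workers 4"

# with crawler_browser_instances left unset, the `default 1` filter falls back to one worker
CRAWL_ARGS: "--timeout 120 ... --waitOnDone --workers 1"
```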


@@ -138,7 +138,9 @@ crawler_namespace: "crawlers"
crawl_retries: 1000
# browsertrix-crawler args:
crawler_args: "--timeout 120 --logging stats,behaviors,debug --generateWACZ --text --workers 4 --collection thecrawl --screencastPort 9037 --sizeLimit 100000000000 --timeLimit 18000 --healthCheckPort 6065 --waitOnDone --behaviorTimeout 300"
crawler_args: "--timeout 120 --logging stats,behaviors,debug --generateWACZ --text --collection thecrawl --screencastPort 9037 --sizeLimit 100000000000 --timeLimit 18000 --healthCheckPort 6065 --waitOnDone"
crawler_browser_instances: 4
crawler_requests_cpu: "800m"
crawler_limits_cpu: "1200m"