Configure crawler disk utilization threshold via helm chart (#748)

This commit is contained in:
Tessa Walsh 2023-04-06 00:51:53 -04:00 committed by GitHub
parent f6f3b7abba
commit 11ca3e678a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 7 additions and 2 deletions

View File

@@ -14,5 +14,5 @@ def test_settings():
"jwtTokenLifetime": 86400,
"defaultBehaviorTimeSeconds": 300,
"maxPagesPerCrawl": 2,
-"defaultPageLoadTimeSeconds": 120
+"defaultPageLoadTimeSeconds": 120,
}

View File

@@ -21,6 +21,11 @@ default_behavior_time_seconds: 300
# default time to wait for page to fully load before running behaviors (in seconds)
default_page_load_time_seconds: 120
+# disk utilization threshold percentage - when used disk space passes
+# this percentage of total, crawls will gracefully stop to prevent the
+# disk from being filled
+disk_utilization_threshold: 90
# max pages per crawl
# set to non-zero value to enforce global max pages per crawl limit
# if set, each workflow can have a lower limit, but not higher
@@ -146,7 +151,7 @@ crawler_namespace: "crawlers"
crawl_retries: 1000
# browsertrix-crawler args:
-crawler_args: "--logging stats,behaviors,debug --generateWACZ --text --collection thecrawl --screencastPort 9037 --waitOnDone"
+crawler_args: "--logging stats,behaviors,debug --generateWACZ --text --collection thecrawl --screencastPort 9037 --diskUtilization {{ .Values.disk_utilization_threshold | default 90 }} --waitOnDone"
crawler_browser_instances: 2