Fixes #2515.

This PR introduces significantly optimized logic for presigning URLs for crawls and collections:

- For collections, the files needed from all crawls are looked up, then merged with the `presign_urls` table in a single pass, yielding a unified iterator over the files and the presign URLs for those files.
- For crawls, the presign URLs are likewise looked up once, and the same iterator is used for a single crawl with a passed-in list of CrawlFiles.
- URLs that are already signed are added directly to the return list.
- For any remaining URLs, a new bulk presigning function shares an HTTP connection and signs 8 files in parallel (customizable via the helm chart, though this may not be needed). This function is used to call the presigning API in parallel; a sketch of the pattern follows below.
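For illustration only, here is a minimal sketch of that batching pattern, not the actual implementation: the `client` object and its `presign()` coroutine are hypothetical stand-ins for the storage presigning call, and only the `presign_batch_size` chart value (exposed as `PRESIGN_BATCH_SIZE` below, default 8) comes from this PR.

```python
import asyncio
import os

# Batch size comes from the presign_batch_size helm value,
# surfaced to the backend as PRESIGN_BATCH_SIZE (default 8).
PRESIGN_BATCH_SIZE = int(os.environ.get("PRESIGN_BATCH_SIZE", "8"))


async def bulk_presign(client, paths: list[str]) -> list[str]:
    """Presign many files with at most PRESIGN_BATCH_SIZE requests in
    flight at once, reusing one client (and its HTTP connection pool)."""
    sem = asyncio.Semaphore(PRESIGN_BATCH_SIZE)

    async def presign_one(path: str) -> str:
        async with sem:
            # hypothetical call to the storage presigning API
            return await client.presign(path)

    # results come back in the same order as the input paths
    return await asyncio.gather(*(presign_one(p) for p in paths))
```

Already-signed URLs are filtered out before this point, so only the remaining files incur presigning requests.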
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: backend-env-config
  namespace: {{ .Release.Namespace }}

data:
  APP_ORIGIN: {{ .Values.ingress.tls | ternary "https" "http" }}://{{ or .Values.ingress.host ( print "localhost:" ( .Values.local_service_port | default 9870 )) }}

  CRAWLER_NAMESPACE: {{ .Values.crawler_namespace }}

  DEFAULT_NAMESPACE: {{ .Release.Namespace }}

  FRONTEND_ORIGIN: {{ .Values.frontend_alias | default "http://browsertrix-cloud-frontend" }}

  CRAWLER_FQDN_SUFFIX: ".{{ .Values.crawler_namespace }}{{ .Values.fqdn_suffix }}"

  DEFAULT_ORG: "{{ .Values.default_org }}"

  INVITE_EXPIRE_SECONDS: "{{ .Values.invite_expire_seconds }}"

  REGISTRATION_ENABLED: "{{ .Values.registration_enabled | default 0 }}"

  REGISTER_TO_ORG_ID: "{{ .Values.registration_org_id }}"

  ALLOW_DUPE_INVITES: "{{ .Values.allow_dupe_invites | default 0 }}"

  JWT_TOKEN_LIFETIME_MINUTES: "{{ .Values.jwt_token_lifetime_minutes | default 60 }}"

  DEFAULT_BEHAVIOR_TIME_SECONDS: "{{ .Values.default_behavior_time_seconds }}"

  DEFAULT_PAGE_LOAD_TIME_SECONDS: "{{ .Values.default_page_load_time_seconds }}"

  DEFAULT_CRAWL_FILENAME_TEMPLATE: "{{ .Values.default_crawl_filename_template }}"

  DEFAULT_CRAWLER_IMAGE_PULL_POLICY: "{{ .Values.crawler_pull_policy }}"

  MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"

  IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"

  RERUN_FROM_MIGRATION: "{{ .Values.rerun_from_migration }}"
  MIGRATION_JOBS_SCALE: "{{ .Values.migration_jobs_scale | default 1 }}"

  PRESIGN_DURATION_MINUTES: "{{ .Values.storage_presign_duration_minutes }}"

  FAST_RETRY_SECS: "{{ .Values.operator_fast_resync_secs | default 3 }}"

  MAX_CRAWL_SCALE: "{{ .Values.max_crawl_scale | default 3 }}"

  LOG_FAILED_CRAWL_LINES: "{{ .Values.log_failed_crawl_lines | default 0 }}"

  IS_LOCAL_MINIO: "{{ .Values.minio_local }}"

  LOCAL_MINIO_ACCESS_PATH: "{{ .Values.minio_access_path }}"

  STORAGES_JSON: "/ops-configs/storages.json"

  CRAWLER_CHANNELS_JSON: "/ops-configs/crawler_channels.json"

  CRAWLER_PROXIES_LAST_UPDATE: "/ops-proxy-configs/crawler_proxies_last_update"
  CRAWLER_PROXIES_JSON: "/ops-proxy-configs/crawler_proxies.json"

  DEFAULT_PROXY_ID: "{{ .Values.default_proxy }}"

  MIN_QA_CRAWLER_IMAGE: "{{ .Values.min_qa_crawler_image }}"

  MIN_AUTOCLICK_CRAWLER_IMAGE: "{{ .Values.min_autoclick_crawler_image }}"

  NUM_BROWSERS: "{{ .Values.crawler_browser_instances }}"

  MAX_CRAWLER_MEMORY: "{{ .Values.max_crawler_memory }}"

  CRAWLER_MIN_AVAIL_STORAGE_RATIO: "{{ .Values.crawler_min_avail_storage_ratio }}"

  ENABLE_AUTO_RESIZE_CRAWLERS: "{{ .Values.enable_auto_resize_crawlers }}"

  BILLING_ENABLED: "{{ .Values.billing_enabled }}"

  SIGN_UP_URL: "{{ .Values.sign_up_url }}"

  SALES_EMAIL: "{{ .Values.sales_email }}"

  USER_SURVEY_URL: "{{ .Values.user_survey_url }}"

  LOG_SENT_EMAILS: "{{ .Values.email.log_sent_emails }}"

  BACKEND_IMAGE: "{{ .Values.backend_image }}"

  BACKEND_IMAGE_PULL_POLICY: "{{ .Values.backend_pull_policy }}"

  LOCALES_ENABLED: "{{ .Values.locales_enabled }}"

  REPLICA_DELETION_DELAY_DAYS: "{{ .Values.replica_deletion_delay_days | default 0 }}"

  PRESIGN_BATCH_SIZE: "{{ .Values.presign_batch_size | default 8 }}"


---
apiVersion: v1
kind: ConfigMap
metadata:
  name: shared-crawler-config
  namespace: {{ .Values.crawler_namespace }}

data:
  {{- define "btrix.crawler_args" }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --logBehaviorsToRedis --writePagesToRedis --restartsOnError --headless --screenshot view,thumbnail {{ .Values.crawler_extra_args }} {{- end }}

  CRAWL_ARGS: {{- include "btrix.crawler_args" . }}

  # disable behaviors for QA runs, otherwise use same args
  QA_ARGS: {{- include "btrix.crawler_args" . }} --behaviors=""

---
apiVersion: v1
kind: ConfigMap
metadata:
  name: shared-job-config
  #namespace: {{ .Values.crawler_namespace }}
  namespace: {{ .Release.Namespace }}


data:
  config.yaml: |
    namespace: {{ .Values.crawler_namespace }}
    termination_grace_secs: "{{ .Values.grace_period_secs | default 600 }}"

    volume_storage_class: "{{ .Values.volume_storage_class }}"

    # redis
    redis_image: {{ .Values.redis_image }}
    redis_image_pull_policy: {{ .Values.redis_pull_policy }}

    redis_cpu: "{{ .Values.redis_cpu }}"

    redis_memory: "{{ .Values.redis_memory }}"

    redis_storage: "{{ .Values.redis_storage }}"

    # crawler
    crawler_image_pull_policy: {{ .Values.crawler_pull_policy }}

    crawler_cpu_base: "{{ .Values.crawler_cpu_base }}"
    crawler_memory_base: "{{ .Values.crawler_memory_base }}"

    crawler_extra_cpu_per_browser: "{{ .Values.crawler_extra_cpu_per_browser | default 0 }}"
    crawler_extra_memory_per_browser: "{{ .Values.crawler_extra_memory_per_browser | default 0 }}"

    crawler_browser_instances: "{{ .Values.crawler_browser_instances }}"
    qa_browser_instances: "{{ .Values.qa_browser_instances }}"

    crawler_cpu: "{{ .Values.crawler_cpu }}"
    crawler_memory: "{{ .Values.crawler_memory }}"

    crawler_storage: "{{ .Values.crawler_storage }}"

    volume_storage_class: "{{ .Values.volume_storage_class }}"

    profile_browser_cpu: "{{ .Values.profile_browser_cpu }}"
    profile_browser_memory: "{{ .Values.profile_browser_memory }}"

    crawler_liveness_port: "{{ .Values.crawler_liveness_port | default 0 }}"

    crawler_socks_proxy_host: "{{ .Values.crawler_socks_proxy_host }}"
    crawler_socks_proxy_port: "{{ .Values.crawler_socks_proxy_port }}"

    crawler_uid: "{{ .Values.crawler_uid | default 201407 }}"
    crawler_gid: "{{ .Values.crawler_gid | default 201407 }}"
    crawler_fsgroup: "{{ .Values.crawler_fsgroup | default 201407 }}"

    profile_browser_workdir_size: "{{ .Values.profile_browser_workdir_size | default "4Gi" }}"

    qa_scale: "{{ .Values.qa_scale | default 1 }}"

    crawler_node_type: "{{ .Values.crawler_node_type }}"
    redis_node_type: "{{ .Values.redis_node_type }}"

    signing_secret: {{ and .Values.signer.enabled (not (empty .Values.signer.auth_token)) | ternary "signing-secret" "" }}

---
apiVersion: v1
kind: ConfigMap
metadata:
  name: shared-redis-conf
  namespace: {{ .Values.crawler_namespace }}

data:
  redis.conf: |
    appendonly yes
    dir /data

---
apiVersion: v1
kind: ConfigMap
metadata:
  name: app-templates
  namespace: {{ .Release.Namespace }}

data:
{{ (.Files.Glob "app-templates/*.yaml").AsConfig | indent 2 }}

---
apiVersion: v1
kind: ConfigMap
metadata:
  name: email-templates
  namespace: {{ .Release.Namespace }}

data:
  {{- $email_templates := .Values.email.templates | default dict }}
  {{- range tuple "failed_bg_job" "invite" "password_reset" "validate" "sub_cancel" }}
  {{ . }}: |
{{ ((get $email_templates . ) | default ($.Files.Get (printf "%s/%s" "email-templates" . ))) | indent 4 }}
  {{- end }}