Fixes #1597

Adds new endpoints (replacing the old migration) to re-add crawl pages to the database from WACZs. After a few implementation attempts, we settled on [remotezip](https://github.com/gtsystem/python-remotezip) to handle parsing the zip files and streaming their contents line by line for pages. I've also modified the sync log streaming to use remotezip, which lets us remove our own zip module and leave the complexity of zip parsing to the library.

Database inserts for pages from WACZs are batched 100 at a time to speed up the endpoint, and the task is kicked off with `asyncio.create_task` so that it doesn't block the response. `StorageOps` now has a method for streaming the bytes of any file in a remote WACZ, given only the presigned URL for the WACZ and the name of the file to stream.
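To illustrate the approach, here is a minimal sketch (not the actual PR code) of streaming one file out of a remote WACZ with remotezip and batching page inserts. `presigned_url`, `crawl_id`, and `pages_coll` are placeholder names, and `pages/pages.jsonl` is assumed as the page list's path inside the WACZ:

```python
import asyncio
import json

from remotezip import RemoteZip

BATCH_SIZE = 100


def iter_wacz_file_lines(presigned_url: str, filename: str):
    """Yield decoded lines from one file inside a remote WACZ.

    remotezip fetches only the byte ranges it needs via HTTP range
    requests, so the whole archive is never downloaded.
    """
    with RemoteZip(presigned_url) as rz:
        with rz.open(filename) as fh:
            for line in fh:
                yield line.decode("utf-8", errors="replace")


async def re_add_crawl_pages(presigned_url: str, crawl_id: str, pages_coll) -> None:
    """Parse pages from pages.jsonl and insert them 100 at a time."""
    batch = []
    # Note: the zip reads here are blocking; a real implementation would
    # offload them to a thread/executor to keep the event loop responsive.
    for line in iter_wacz_file_lines(presigned_url, "pages/pages.jsonl"):
        line = line.strip()
        if not line:
            continue
        page = json.loads(line)
        if "url" not in page:
            # skip the JSONL header / metadata record
            continue
        page["crawl_id"] = crawl_id
        batch.append(page)
        if len(batch) >= BATCH_SIZE:
            await pages_coll.insert_many(batch)
            batch = []
    if batch:
        await pages_coll.insert_many(batch)


# The endpoint handler can schedule the work without blocking its response:
# asyncio.create_task(re_add_crawl_pages(presigned_url, crawl_id, pages_coll))
```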
# Crawler Settings
# =========================================

# default time to run behaviors on each page (in seconds)
default_behavior_time_seconds: 300

# default time to wait for page to fully load before running behaviors (in seconds)
default_page_load_time_seconds: 120

# disk utilization threshold percentage - when used disk space passes
# this percentage of total, crawls will gracefully stop to prevent the
# disk from being filled
# This should be a string so that it can be included in crawler_args
disk_utilization_threshold: 90

# crawler logging flags
crawler_logging_opts: "stats,behaviors,debug"

# to enable, set to a value other than 'false'
crawler_extract_full_text: false

# max pages per crawl
# set to non-zero value to enforce global max pages per crawl limit
# if 0, there is no page limit (may need to adjust crawler/redis settings for larger crawls)
# if set, each workflow can have a lower limit, but not higher
max_pages_per_crawl: 50000

# default template for generated wacz files
# supports the following interpolated vars:
# @ts - current timestamp
# @hostname - full hostname
# @hostsuffix - last 14 characters of hostname
# @id - full crawl id
default_crawl_filename_template: "@ts-@hostsuffix.wacz"


# advanced: additional args to be passed to the crawler
# this is mostly for testing of new/experimental crawler flags
# standard crawler options are covered with other options above
crawler_extra_args: ""


# max allowed crawl scale per crawl
max_crawl_scale: 3

# Cluster Settings
# =========================================
name: browsertrix-cloud

# when running in the cloud, set this value to cloud-specific block storage
# keep empty to use hostPath (eg. on minikube)
volume_storage_class:

# if set, sets the node selector 'nodeType' for deployment pods
# main_node_type:

# if set, sets the node selector 'nodeType' for crawling pods
# crawler_node_type:

registration_enabled: "0"
jwt_token_lifetime_minutes: 1440

# if set to "1", allow inviting same user to same org multiple times
allow_dupe_invites: "0"

# number of seconds before pending invites expire - default is 7 days
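# (604800 seconds = 7 * 24 * 60 * 60)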
invite_expire_seconds: 604800

# base url for replayweb.page
rwp_base_url: "https://cdn.jsdelivr.net/npm/replaywebpage@1.8.12/"

superuser:
  # set this to enable a superuser admin
  email: admin@example.com

  # optional: if not set, automatically generated
  # change or remove this
  password: PASSW0RD!

# Set name for default organization created with superuser
default_org: "My Organization"

# API Image
# =========================================
backend_image: "docker.io/webrecorder/browsertrix-backend:1.10.0-beta.0"
backend_pull_policy: "Always"

backend_password_secret: "PASSWORD!"

# number of backend pods
backend_num_replicas: 1

# number of workers per pod
backend_workers: 1

backend_cpu: "25m"

backend_memory: "350Mi"

# port for operator service
opPort: 8756

job_cpu: "3m"
job_memory: "70Mi"

profile_browser_idle_seconds: 60

# if set, print last 'log_failed_crawl_lines' of each failed
# crawl pod to backend operator stdout
# mostly intended for debugging / testing
# log_failed_crawl_lines: 200

# Nginx Image
# =========================================
frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.10.0-beta.0"
frontend_pull_policy: "Always"

frontend_cpu: "10m"

frontend_memory: "64Mi"

# if set, maps nginx to a fixed port on host machine
# must be between 30000 - 32767
# use for deployments on localhost when not using ingress
# if using ingress, this value is ignored
local_service_port: 30870

frontend_alias: "http://browsertrix-cloud-frontend"

# MongoDB Image
# =========================================
mongo_local: true

mongo_host: "local-mongo.default"

mongo_image: "docker.io/library/mongo:6.0.5"
mongo_pull_policy: "IfNotPresent"

mongo_cpu: "12m"

mongo_memory: "512Mi"


mongo_auth:
  # specify either username + password (for local mongo)
  username: root
  password: PASSWORD!

  # or full URL (for remote mongo server)
  # db_url: mongodb+srv://...

# Redis Image
# =========================================
redis_local: true

redis_image: "redis"
redis_pull_policy: "IfNotPresent"

redis_url: "redis://local-redis.default:6379/1"

redis_cpu: "10m"

redis_memory: "200Mi"

redis_storage: "3Gi"

# Crawler Channels
# =========================================
# Support for additional crawler release channels
# If more than one channel is provided, a dropdown will be shown to users
# 'default' channel must always be included
crawler_channels:
  - id: default
    image: "docker.io/webrecorder/browsertrix-crawler:latest"

  # Add, remove, or edit additional crawler versions below, for example:
  # - id: custom_version
  #   image: "<DOCKER IMAGE>"

crawler_pull_policy: "Always"

crawler_namespace: "crawlers"

# optional: enable to use a persistent volume claim for all crawls
# can be enabled to use a multi-write shared filesystem
# crawler_pv_claim: "nfs-shared-crawls"

# num retries
crawl_retries: 1000

# Crawler Resources
# -----------------

# base cpu for 1 browser
crawler_cpu_base: 900m

# base memory for 1 browser
crawler_memory_base: 1024Mi

# number of browsers per crawler instance
crawler_browser_instances: 2

# this value is added to crawler_cpu_base for each additional browser
# crawler_cpu = crawler_cpu_base + crawler_extra_cpu_per_browser * (crawler_browser_instances - 1)
crawler_extra_cpu_per_browser: 600m

crawler_extra_memory_per_browser: 768Mi

# if not set, defaults to the following, but can be overridden directly:
# crawler_cpu = crawler_cpu_base + crawler_extra_cpu_per_browser * (crawler_browser_instances - 1)
# crawler_cpu:

# if not set, defaults to the following, but can be overridden directly:
# crawler_memory = crawler_memory_base + crawler_extra_memory_per_browser * (crawler_browser_instances - 1)
# crawler_memory:
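# Worked example with the defaults above (for illustration):
# crawler_cpu    = 900m   + 600m  * (2 - 1) = 1500m
# crawler_memory = 1024Mi + 768Mi * (2 - 1) = 1792Mi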
# optional: defaults to crawler_memory_base and crawler_cpu_base if not set
# profile_browser_memory:
#
# profile_browser_cpu:

# Other Crawler Settings
# ----------------------

# minimum storage allocated to each crawler
# should be at least double the crawl session size to ensure space for the WACZ
crawler_storage: "22Gi"

# max size at which crawler will commit current crawl session
crawler_session_size_limit_bytes: "10000000000"

# max time in seconds after which crawler will restart, if set
crawler_session_time_limit_seconds: 18000

crawler_liveness_port: 6065

# optional: use socks5 proxy for crawler and profilebrowser
# crawler_socks_proxy_host: 192.0.2.1
# crawler_socks_proxy_port: 9050

# time to wait for graceful stop
grace_period: 1000

# Local Minio Pod (optional)
# =========================================
# set to true to use a local minio image
minio_local: true

# enable to allow access to minio console via specified port
# minio_local_console_port: 30091

minio_scheme: "http"
minio_host: "local-minio.default:9000"

minio_image: docker.io/minio/minio:RELEASE.2022-10-24T18-35-07Z
minio_mc_image: minio/mc
minio_pull_policy: "IfNotPresent"

minio_local_bucket_name: &local_bucket_name "btrix-data"

minio_cpu: "10m"
minio_memory: "1024Mi"

# Storage
# =========================================
# should include the local minio bucket, if enabled, and any other available buckets for default storage

storages:
  - name: "default"
    type: "s3"
    access_key: "ADMIN"
    secret_key: "PASSW0RD"
    bucket_name: *local_bucket_name

    endpoint_url: "http://local-minio.default:9000/"


# optional: duration in minutes for WACZ download links to be valid
# used by webhooks and replay
# max value = 10079 (one week minus one minute)
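# (10079 minutes = 7 * 24 * 60 - 1)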
# storage_presign_duration_minutes: 10079

# Email Options
# =========================================
email:
  # email sending is enabled when 'smtp_host' is set to non-empty value
  # ex: smtp_host: smtp.gmail.com
  smtp_host: ""
  smtp_port: 587
  sender_email: example@example.com
  password: password
  reply_to_email: example@example.com
  use_tls: True
  support_email: support@example.com

# Deployment options
# =========================================

# Ingress (Optional)
# Optional: if 'host' is set, a publicly accessible Ingress controller is created with an SSL cert (using letsencrypt)
ingress:
  # host: ""
  cert_email: "test@example.com"
  tls: false
  # Optional: Uncomment to use your own cluster-issuer instead of default ACME https validation
  # custom_cluster_issuer: custom_cluster_issuer-name

ingress_class: nginx

# Signing Options
# =========================================
# optionally enable signer
signer:
  enabled: false
  image: webrecorder/authsign:0.5.0
  # host: <set to signer domain>
  # cert_email: "test@example.com"
  # image_pull_policy: "IfNotPresent"
  # auth_token: <set to custom value>

signer_cpu: "5m"

signer_memory: "40Mi"

# Optional: configure load balancing annotations
# service:
#   annotations:
#     service.beta.kubernetes.io/aws-load-balancer-internal: "true"
#     helm.sh/resource-policy: keep

# Admin services (see Chart.yaml's dependencies)
# note: see `chart/examples/local-logging.yaml`
addons:
  admin:
    logging: false

# metacontroller: