browsertrix/chart/templates/configmap.yaml
Tessa Walsh 14189b7cfb
Add crawl pages and related API endpoints (#1516)
Fixes #1502 

- Adds pages to the database as they are added to Redis during a crawl
- Adds a migration to backfill pages for older crawls from the
pages.jsonl and extraPages.jsonl files in each WACZ
- Adds GET, list GET, and PATCH update endpoints for pages
- Adds POST (add), PATCH, and POST (delete) endpoints for page notes,
each with their own id, timestamp, and user info in addition to text
- Adds page_ops methods for (1) adding resources/URLs to a page and (2)
adding automated heuristics and supplemental info (mime, type, etc.) to a
page (for use in the crawl QA job)
- Modifies `Migration` class to accept kwargs so that we can pass in ops
classes as needed for migrations
- Deletes WACZ files and pages from the database for failed crawls during
the crawl_finished process
- Deletes crawl pages when a crawl is deleted

Note: Requires crawler version 1.0.0 beta 3 or later, with support for
`--writePagesToRedis` to populate pages at crawl completion. Beta 4 is
configured in the test chart and should be upgraded to the stable 1.0.0
release when it is available.

Connected to https://github.com/webrecorder/browsertrix-crawler/pull/464

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
2024-02-28 12:11:35 -05:00

---
apiVersion: v1
kind: ConfigMap
metadata:
name: backend-env-config
namespace: {{ .Release.Namespace }}
data:
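  # the scheme comes from the ternary on ingress.tls; e.g. with hypothetical
  # values ingress.tls=true and ingress.host=btrix.example.com, the line below
  # renders as: APP_ORIGIN: https://btrix.example.com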
APP_ORIGIN: {{ .Values.ingress.tls | ternary "https" "http" }}://{{ .Values.ingress.host | default "localhost:9870" }}
CRAWLER_NAMESPACE: {{ .Values.crawler_namespace }}
DEFAULT_NAMESPACE: {{ .Release.Namespace }}
CRAWLER_FQDN_SUFFIX: ".{{ .Values.crawler_namespace }}.svc.cluster.local"
DEFAULT_ORG: "{{ .Values.default_org }}"
INVITE_EXPIRE_SECONDS: "{{ .Values.invite_expire_seconds }}"
REGISTRATION_ENABLED: "{{ .Values.registration_enabled | default 0 }}"
ALLOW_DUPE_INVITES: "{{ .Values.allow_dupe_invites | default 0 }}"
JWT_TOKEN_LIFETIME_MINUTES: "{{ .Values.jwt_token_lifetime_minutes | default 60 }}"
DEFAULT_BEHAVIOR_TIME_SECONDS: "{{ .Values.default_behavior_time_seconds }}"
DEFAULT_PAGE_LOAD_TIME_SECONDS: "{{ .Values.default_page_load_time_seconds }}"
DEFAULT_CRAWL_FILENAME_TEMPLATE: "{{ .Values.default_crawl_filename_template }}"
MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"
IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"
RERUN_FROM_MIGRATION: "{{ .Values.rerun_from_migration }}"
PRESIGN_DURATION_MINUTES: "{{ .Values.storage_presign_duration_minutes }}"
FAST_RETRY_SECS: "{{ .Values.operator_fast_resync_secs | default 3 }}"
MAX_CRAWL_SCALE: "{{ .Values.max_crawl_scale | default 3 }}"
LOG_FAILED_CRAWL_LINES: "{{ .Values.log_failed_crawl_lines | default 0 }}"
IS_LOCAL_MINIO: "{{ .Values.minio_local }}"
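  # the two JSON paths below refer to files mounted into the backend pods at
  # /ops-configs, presumably from an ops-configs secret defined elsewhere in
  # this chart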
STORAGES_JSON: "/ops-configs/storages.json"
CRAWLER_CHANNELS_JSON: "/ops-configs/crawler_channels.json"
---
apiVersion: v1
kind: ConfigMap
metadata:
name: shared-crawler-config
namespace: {{ .Values.crawler_namespace }}
data:
CRAWL_ARGS: >-
--workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --writePagesToRedis --restartsOnError --headless --screenshot view,thumbnail {{ .Values.crawler_extra_args }}
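  # example rendering, assuming hypothetical values crawler_browser_instances=2
  # and crawler_session_size_limit_bytes=10000000000: the args begin
  # "--workers 2 --sizeLimit 10000000000 ...", with crawler_extra_args appended
  # verbatim at the end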
---
apiVersion: v1
kind: ConfigMap
metadata:
name: shared-job-config
#namespace: {{ .Values.crawler_namespace }}
namespace: {{ .Release.Namespace }}
data:
config.yaml: |
namespace: {{ .Values.crawler_namespace }}
termination_grace_secs: "{{ .Values.grace_period_secs | default 600 }}"
volume_storage_class: "{{ .Values.volume_storage_class }}"
# redis
redis_image: {{ .Values.redis_image }}
redis_image_pull_policy: {{ .Values.redis_pull_policy }}
redis_cpu: "{{ .Values.redis_cpu }}"
redis_memory: "{{ .Values.redis_memory }}"
redis_storage: "{{ .Values.redis_storage }}"
# crawler
crawler_image_pull_policy: {{ .Values.crawler_pull_policy }}
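    # if crawler_cpu / crawler_memory are left unset, the operator is expected
    # to derive them as base + extra_per_browser * (browser_instances - 1);
    # e.g. a hypothetical 900m base and 600m extra with 2 browsers gives 1500m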
crawler_cpu_base: "{{ .Values.crawler_cpu_base }}"
crawler_memory_base: "{{ .Values.crawler_memory_base }}"
crawler_extra_cpu_per_browser: "{{ .Values.crawler_extra_cpu_per_browser | default 0 }}"
crawler_extra_memory_per_browser: "{{ .Values.crawler_extra_memory_per_browser | default 0 }}"
crawler_browser_instances: "{{ .Values.crawler_browser_instances }}"
crawler_cpu: "{{ .Values.crawler_cpu }}"
crawler_memory: "{{ .Values.crawler_memory }}"
crawler_storage: "{{ .Values.crawler_storage }}"
crawler_liveness_port: "{{ .Values.crawler_liveness_port | default 0 }}"
crawler_socks_proxy_host: "{{ .Values.crawler_socks_proxy_host }}"
crawler_socks_proxy_port: "{{ .Values.crawler_socks_proxy_port }}"
crawler_node_type: "{{ .Values.crawler_node_type }}"
redis_node_type: "{{ .Values.redis_node_type }}"
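    # renders to the literal secret name "signing-secret" only when both
    # signer.enabled is true and signer.auth_token is non-empty; otherwise empty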
signing_secret: {{ and .Values.signer.enabled (not (empty .Values.signer.auth_token)) | ternary "signing-secret" "" }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: shared-redis-conf
namespace: {{ .Values.crawler_namespace }}
data:
redis.conf: |
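    # append-only-file persistence so in-flight crawl state survives restarts;
    # /data is the Redis volume mount point (presumably sized by redis_storage
    # in shared-job-config above)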
appendonly yes
dir /data
---
apiVersion: v1
kind: ConfigMap
metadata:
name: app-templates
namespace: {{ .Release.Namespace }}
data:
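  # .Files.Glob packs every matching chart file into this ConfigMap, one key
  # per base file name; e.g. a file app-templates/crawl_job.yaml (an assumed
  # example name) would appear under the key "crawl_job.yaml"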
{{ (.Files.Glob "app-templates/*.yaml").AsConfig | indent 2 }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: email-templates
namespace: {{ .Release.Namespace }}
data:
{{ (.Files.Glob "email-templates/*").AsConfig | indent 2 }}