Supports running QA Runs via the QA API! Builds on top of the `issue-1498-crawl-qa-backend-support` branch; fixes #1498.

Also requires the latest Browsertrix Crawler 1.1.0+ (from the webrecorder/browsertrix-crawler#469 branch).

Notable changes:
- `QARun` objects contain info about QA runs, which are crawls performed on data loaded from existing crawls.
- Various crawl db operations can be performed on either the crawl or the `qa.` object, and core crawl fields have been moved to `CoreCrawlable`.
- While running, `QARun` data is stored in a single `qa` object, while finished QA runs are added to the `qaFinished` dictionary on the Crawl. The QA list API returns data from the finished list, sorted by most recent first.
- Includes additional type fixes / type safety, especially around BaseCrawl / Crawl / UploadedCrawl functionality; also creates specific get_upload(), get_basecrawl(), get_crawl() getters for internal use and get_crawl_out() for the API.
- Supports filtering and sorting pages via `qaFilterBy` (screenshotMatch, textMatch) along with `gt`, `lt`, `gte`, `lte` params to return pages based on QA results.

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
151 lines
4.7 KiB
YAML
151 lines
4.7 KiB
YAML
---
# ConfigMap "backend-env-config": string key/value settings rendered from
# chart values into the release namespace.
#
# ConfigMap `data` values must be strings, so every templated value is
# wrapped in double quotes. This keeps an empty, numeric, or boolean-looking
# expansion from being re-typed by the YAML parser.
apiVersion: v1
kind: ConfigMap
metadata:
  name: backend-env-config
  namespace: {{ .Release.Namespace }}

data:
  # Scheme is https when ingress.tls is set, otherwise http; the host falls
  # back to localhost:9870 when ingress.host is not set.
  APP_ORIGIN: "{{ .Values.ingress.tls | ternary "https" "http" }}://{{ .Values.ingress.host | default "localhost:9870" }}"

  CRAWLER_NAMESPACE: "{{ .Values.crawler_namespace }}"

  DEFAULT_NAMESPACE: "{{ .Release.Namespace }}"

  FRONTEND_ORIGIN: "{{ .Values.frontend_alias | default "http://browsertrix-cloud-frontend" }}"

  CRAWLER_FQDN_SUFFIX: ".{{ .Values.crawler_namespace }}.svc.cluster.local"

  DEFAULT_ORG: "{{ .Values.default_org }}"

  INVITE_EXPIRE_SECONDS: "{{ .Values.invite_expire_seconds }}"

  REGISTRATION_ENABLED: "{{ .Values.registration_enabled | default 0 }}"

  ALLOW_DUPE_INVITES: "{{ .Values.allow_dupe_invites | default 0 }}"

  JWT_TOKEN_LIFETIME_MINUTES: "{{ .Values.jwt_token_lifetime_minutes | default 60 }}"

  DEFAULT_BEHAVIOR_TIME_SECONDS: "{{ .Values.default_behavior_time_seconds }}"

  DEFAULT_PAGE_LOAD_TIME_SECONDS: "{{ .Values.default_page_load_time_seconds }}"

  DEFAULT_CRAWL_FILENAME_TEMPLATE: "{{ .Values.default_crawl_filename_template }}"

  MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"

  IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"

  RERUN_FROM_MIGRATION: "{{ .Values.rerun_from_migration }}"

  PRESIGN_DURATION_MINUTES: "{{ .Values.storage_presign_duration_minutes }}"

  FAST_RETRY_SECS: "{{ .Values.operator_fast_resync_secs | default 3 }}"

  MAX_CRAWL_SCALE: "{{ .Values.max_crawl_scale | default 3 }}"

  LOG_FAILED_CRAWL_LINES: "{{ .Values.log_failed_crawl_lines | default 0 }}"

  IS_LOCAL_MINIO: "{{ .Values.minio_local }}"

  STORAGES_JSON: "/ops-configs/storages.json"

  CRAWLER_CHANNELS_JSON: "/ops-configs/crawler_channels.json"

---
# ConfigMap "shared-crawler-config": a single CRAWL_ARGS string assembled
# from chart values, created in the crawler namespace.
apiVersion: v1
kind: ConfigMap
metadata:
  name: shared-crawler-config
  namespace: {{ .Values.crawler_namespace }}

data:
  # Folded scalar (>-): the line breaks below are joined with single spaces,
  # so this renders as one command-line string with no trailing newline.
  # The optional extra args stay on the final flag's line so an empty value
  # does not turn into a blank line.
  CRAWL_ARGS: >-
    --workers {{ .Values.crawler_browser_instances | default 1 }}
    --sizeLimit {{ .Values.crawler_session_size_limit_bytes }}
    --timeLimit {{ .Values.crawler_session_time_limit_seconds }}
    --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }}
    --healthCheckPort {{ .Values.crawler_liveness_port }}
    --diskUtilization {{ .Values.disk_utilization_threshold }}
    --logging {{ .Values.crawler_logging_opts }}
    --text {{ .Values.crawler_extract_full_text }}
    --generateWACZ
    --collection thecrawl
    --screencastPort 9037
    --logErrorsToRedis
    --writePagesToRedis
    --restartsOnError
    --headless
    --screenshot view,thumbnail {{ .Values.crawler_extra_args }}

---
# ConfigMap "shared-job-config": carries one embedded YAML document under the
# key `config.yaml`, rendered from chart values.
#
# It is created in the release namespace; the embedded document's own
# `namespace` field points at the crawler namespace.
#
# `volume_storage_class` used to be listed twice in the embedded document with
# the same expression. Duplicate mapping keys are invalid YAML, so only the
# first occurrence is kept.
apiVersion: v1
kind: ConfigMap
metadata:
  name: shared-job-config
  # namespace: {{ .Values.crawler_namespace }}
  namespace: {{ .Release.Namespace }}


data:
  config.yaml: |
    namespace: {{ .Values.crawler_namespace }}
    termination_grace_secs: "{{ .Values.grace_period_secs | default 600 }}"

    volume_storage_class: "{{ .Values.volume_storage_class }}"

    # redis
    redis_image: {{ .Values.redis_image }}
    redis_image_pull_policy: {{ .Values.redis_pull_policy }}

    redis_cpu: "{{ .Values.redis_cpu }}"

    redis_memory: "{{ .Values.redis_memory }}"

    redis_storage: "{{ .Values.redis_storage }}"

    # crawler
    crawler_image_pull_policy: {{ .Values.crawler_pull_policy }}

    crawler_cpu_base: "{{ .Values.crawler_cpu_base }}"
    crawler_memory_base: "{{ .Values.crawler_memory_base }}"

    crawler_extra_cpu_per_browser: "{{ .Values.crawler_extra_cpu_per_browser | default 0 }}"
    crawler_extra_memory_per_browser: "{{ .Values.crawler_extra_memory_per_browser | default 0 }}"

    crawler_browser_instances: "{{ .Values.crawler_browser_instances }}"

    crawler_cpu: "{{ .Values.crawler_cpu }}"
    crawler_memory: "{{ .Values.crawler_memory }}"

    crawler_storage: "{{ .Values.crawler_storage }}"

    crawler_liveness_port: "{{ .Values.crawler_liveness_port | default 0 }}"

    crawler_socks_proxy_host: "{{ .Values.crawler_socks_proxy_host }}"
    crawler_socks_proxy_port: "{{ .Values.crawler_socks_proxy_port }}"

    crawler_node_type: "{{ .Values.crawler_node_type }}"
    redis_node_type: "{{ .Values.redis_node_type }}"

    signing_secret: {{ and .Values.signer.enabled (not (empty .Values.signer.auth_token)) | ternary "signing-secret" "" }}

---
# ConfigMap "shared-redis-conf": ships a two-line `redis.conf` into the
# crawler namespace. The file content turns on `appendonly` and sets `dir`
# to /data; it is kept as a literal block so it is emitted verbatim.
apiVersion: v1
kind: ConfigMap
metadata:
  name: shared-redis-conf
  namespace: {{ .Values.crawler_namespace }}

data:
  redis.conf: |
    appendonly yes
    dir /data

---
# ConfigMap "app-templates": one entry per chart file matching
# app-templates/*.yaml, created in the release namespace.
apiVersion: v1
kind: ConfigMap
metadata:
  name: app-templates
  namespace: {{ .Release.Namespace }}

data:
  {{- (.Files.Glob "app-templates/*.yaml").AsConfig | nindent 2 }}

---
# ConfigMap "email-templates": one entry per chart file found directly under
# email-templates/, created in the release namespace.
apiVersion: v1
kind: ConfigMap
metadata:
  name: email-templates
  namespace: {{ .Release.Namespace }}

data:
  {{- (.Files.Glob "email-templates/*").AsConfig | nindent 2 }}