* Btrixjobs Operator - Phase 1 (#679)

  - add metacontroller and custom CRDs
  - add main_op entrypoint for the operator

* Btrix Operator Crawl Management (#767)

* operator backend:

  - run the operator API in a separate container, but in the same pod, with WEB_CONCURRENCY=1
  - the operator creates statefulsets and services for CrawlJob and ProfileJob
  - operator: use the service hook endpoint; set the port in values.yaml

* crawls working with CrawlJob (see the CrawlJob sketch after this changelog):

  - jobs start with the 'crawljob-' prefix
  - update status to reflect the current crawl state
  - set sync time to 10 seconds by default, overridable with 'operator_resync_seconds'
  - mark the crawl as running, failed, or complete when finished
  - store the finished status when the crawl is complete
  - support updating scale, forcing rollover, and stopping via patching the CrawlJob
  - support cancel via deletion
  - requires a hack to content-length for patching custom resources
  - auto-delete the CrawlJob via 'ttlSecondsAfterFinished'
  - also delete PVCs until auto-delete via statefulset is supported (k8s >1.27)
  - ensure filesAdded is always set correctly: keep a counter in redis and add it to the status display
  - optimization: attempt to reduce automerging by reusing volumeClaimTemplates from existing children, as these may have additional props added
  - add add_crawl_errors_to_db() for storing crawl errors from the redis '<crawl>:e' key to mongodb when a crawl is finished/failed/canceled
  - add .status.size to display a human-readable crawl size, if available (from webrecorder/browsertrix-crawler#291)
  - support the new page size key (>0.9.0) as well as the old page size key (changed in webrecorder/browsertrix-crawler#284)

* support for scheduled jobs! (see the CronJob sketch after this changelog)

  - add main_scheduled_job entrypoint to run scheduled jobs
  - add crawl_cron_job.yaml template for declaring the CronJob
  - CronJobs moved to the default namespace

* operator manages ProfileJobs:

  - jobs start with the 'profilejob-' prefix
  - update the expiry time by updating the ProfileJob object's 'expireTime' while the profile is active

* refactor/cleanup:

  - remove the k8s package
  - merge k8sman and basecrawlmanager into crawlmanager
  - move templates, k8sapi, and utils into the root package
  - delete all *_job.py files
  - remove dt_now and ts_now from crawls; now in utils
  - all db operations happen in the crawl/crawlconfig/org files
  - move shared crawl/crawlconfig/org functions that use the db to be importable directly, including get_crawl_config, add_new_crawl, and inc_crawl_stats

* role binding: more secure setup; don't allow the crawler namespace any k8s permissions

  - move cronjobs to be created in the default namespace
  - grant the default namespace access to create cronjobs in the default namespace
  - remove the role binding from the crawler namespace

* additional tweaks to templates:

  - templates: split the crawler and redis statefulsets into separate yaml files (in case one needs to be loaded without the other)

* stats / redis optimization:

  - don't update stats in mongodb on every operator sync, only when the crawl is finished
  - for api access, read stats directly from redis to get up-to-date stats
  - move get_page_stats() to utils; add get_redis_url() to k8sapi to unify access

* Add migration for operator changes

  - update the configmap for crawl configs with scale > 1 or crawlTimeout > 0 and an existing schedule, to recreate their CronJobs
  - add an option to rerun the last migration, enabled via env var by running helm with --set=rerun_last_migration=1

* subcharts: move the crawljob and profilejob CRDs to a separate subchart, as this seems to be the best way to guarantee proper install order and updates on upgrade with helm; add the built btrix-crds-0.1.0.tgz subchart

  - metacontroller: use the release from ghcr; add the metacontroller-helm-v4.10.1.tgz subchart
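As an illustration of the subchart wiring, a minimal sketch of what the dependencies block in the parent Chart.yaml might look like; the repository locations and exact chart names here are assumptions, not taken from the actual chart:

dependencies:
  # CRDs packaged separately so helm installs/upgrades them in the right order
  - name: btrix-crds
    version: 0.1.0
    repository: "file://./charts/btrix-crds"     # built as btrix-crds-0.1.0.tgz
  # metacontroller release pulled from ghcr (assumed OCI location)
  - name: metacontroller-helm
    version: v4.10.1
    repository: "oci://ghcr.io/metacontroller"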
* backend api fixes

  - ensure changing the scale of a crawl also updates it in the db
  - crawlconfigs: add 'currCrawlSize' and 'lastCrawlSize' to the crawlconfig api

---------

Co-authored-by: D. Lee <leepro@gmail.com>
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
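For the CrawlJob flow noted above, a minimal hypothetical sketch of a CrawlJob object; the apiVersion group and exact field names are assumptions, not the actual CRD schema:

apiVersion: btrix.cloud/v1        # hypothetical group/version
kind: CrawlJob
metadata:
  name: crawljob-<id>             # jobs start with the 'crawljob-' prefix
spec:
  scale: 2                        # patch this to rescale the crawl
  ttlSecondsAfterFinished: 30     # auto-delete once finished
status:
  state: running                  # running / failed / complete
  filesAdded: 3                   # counter kept in redis
  size: "1.2 GB"                  # human-readable crawl size, if available

Per the notes above, deleting the object cancels the crawl, while patching it drives scale changes, stop, and forced rollover.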
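For the scheduled-jobs flow, a hedged sketch of the shape crawl_cron_job.yaml might take; the template variable names and the entrypoint invocation are assumptions:

apiVersion: batch/v1
kind: CronJob
metadata:
  name: sched-{{ cid }}           # hypothetical naming
  namespace: default              # CronJobs are created in the default namespace
spec:
  schedule: "{{ schedule }}"
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      template:
        spec:
          restartPolicy: Never
          containers:
            - name: scheduled
              image: "{{ backend_image }}"   # hypothetical value name
              # assumed module path for the main_scheduled_job entrypoint
              command: ["python3", "-m", "btrixcloud.main_scheduled_job"]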
# -------
# CRAWLER
# -------
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: crawl-{{ id }}
  namespace: {{ namespace }}
  labels:
    crawl: {{ id }}
    role: crawler

spec:
  selector:
    matchLabels:
      crawl: {{ id }}
      role: crawler

  serviceName: crawl-{{ id }}
  replicas: {{ scale }}
  podManagementPolicy: Parallel

  # not yet supported
  #persistentVolumeClaimRetentionPolicy:
  #  whenDeleted: Delete
  #  whenScaled: Delete
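
  # Until PVC auto-delete is supported via statefulset (k8s 1.27+),
  # the operator deletes these PVCs itself when the crawl finishes.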
  volumeClaimTemplates:
    - metadata:
        name: crawl-data
        labels:
          crawl: {{ id }}
          role: crawler

      spec:
        accessModes:
          - ReadWriteOnce

        resources:
          requests:
            storage: {{ requests_hd }}

        {% if volume_storage_class %}
        storageClassName: {{ volume_storage_class }}
        {% endif %}

  template:
    metadata:
      labels:
        crawl: {{ id }}
        role: crawler

      {% if force_restart %}
      annotations:
        btrix.crawlForceRestart: "{{ force_restart }}"
      {% endif %}

    spec:
      terminationGracePeriodSeconds: {{ termination_grace_secs }}
      #nodeSelector: {{ crawl_node_selector }}
      volumes:
        - name: crawl-config
          configMap:
            name: crawl-config-{{ cid }}

      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 1
              preference:
                matchExpressions:
                  - key: nodeType
                    operator: In
                    values:
                      - "{{ crawler_node_type }}"

        podAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 2
              podAffinityTerm:
                topologyKey: "failure-domain.beta.kubernetes.io/zone"
                labelSelector:
                  matchLabels:
                    job-name: job-{{ id }}
                    crawl: {{ id }}

      tolerations:
        - key: "nodeType"
          operator: "Equal"
          value: "crawling"
          effect: "NoSchedule"

      containers:
        - name: crawler
          image: {{ crawler_image }}
          imagePullPolicy: {{ crawler_image_pull_policy }}
          command:
            - crawl
            - --config
            - /tmp/crawl-config.json
            - --redisStoreUrl
            - {{ redis_url }}
          {%- if profile_filename %}
            - --profile
            - "@profiles/{{ profile_filename }}"
          {%- endif %}

          volumeMounts:
            - name: crawl-config
              mountPath: /tmp/crawl-config.json
              subPath: crawl-config.json
              readOnly: True

            - name: crawl-data
              mountPath: /crawls

          envFrom:
            - configMapRef:
                name: shared-crawler-config

            - secretRef:
                name: storage-{{ storage_name }}

          env:
            - name: CRAWL_ID
              value: {{ id }}

            - name: WEBHOOK_URL
              value: {{ redis_url }}/crawls-done

            - name: STORE_PATH
              value: {{ store_path }}

            - name: STORE_FILENAME
              value: {{ store_filename }}

            - name: STORE_USER
              value: {{ userid }}

          resources:
            limits:
              cpu: {{ crawler_limits_cpu }}
              memory: {{ crawler_limits_memory }}

            requests:
              cpu: {{ crawler_requests_cpu }}
              memory: {{ crawler_requests_memory }}

          {% if crawler_liveness_port and crawler_liveness_port != '0' %}
          livenessProbe:
            httpGet:
              path: /healthz
              port: {{ crawler_liveness_port }}

            initialDelaySeconds: 15
            periodSeconds: 120
            failureThreshold: 3
          {% endif %}
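
# The Service below is headless (clusterIP: None): it pairs with the
# StatefulSet's serviceName to give each crawler pod a stable DNS name,
# and exposes the crawler's screencast port.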
---
apiVersion: v1
kind: Service
metadata:
  name: crawl-{{ id }}
  labels:
    crawl: {{ id }}
    role: crawler

spec:
  clusterIP: None
  selector:
    crawl: {{ id }}
    role: crawler

  ports:
    - protocol: TCP
      port: 9037
      name: screencast
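
Once clusters can rely on k8s 1.27+, the commented-out retention policy in the StatefulSet above should be able to replace the operator's manual PVC cleanup; a sketch of the uncommented form:

spec:
  persistentVolumeClaimRetentionPolicy:
    whenDeleted: Delete   # remove PVCs when the StatefulSet is deleted
    whenScaled: Delete    # remove PVCs when the crawl is scaled down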