browsertrix/backend/btrixcloud/templates/crawler.yaml
Ilya Kreymer 989ed2a8da
Use Shared Services for Crawling, Redis, Profile Browsers (#1088)
* refactor to use role-based services shared across pods:
- 'crawler' service for all crawler screencasting, scales 0 .. N with crawl-<ID>-N.crawler
- 'redis' service for all redis access, redis-<ID>-0.redis
- 'browser' service for all browser access (profile browsers), browser-<ID>-0.browser
- don't create a new service per crawl/profile at all
- enable 'publishNotReadyAddresses' for potentially faster resolving, esp for redis
- remove service as type managed by operator as no longer creating services dynamically
- remove frontend var CRAWLER_SVC_SUFFIX, suffix always '.crawler' to match crawler service name
2023-08-24 20:08:53 -07:00

168 lines
4.0 KiB
YAML

# -------
# CRAWLER
# -------
---
# StatefulSet running the crawler pods for a single crawl.
#
# This file is a Jinja2 template: it is rendered to text first and only then
# parsed as YAML. Templated *string* values are therefore quoted so that an
# all-digit or boolean-looking expansion cannot be re-typed by the YAML
# parser. Integer and resource-quantity fields (replicas, grace period,
# probe port, cpu/memory/storage) are intentionally left unquoted.
#
# Pods are named crawl-<id>-<ordinal> and are governed by the shared
# headless service named by `serviceName` below.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: crawl-{{ id }}
  namespace: "{{ namespace }}"
  labels:
    crawl: "{{ id }}"
    role: crawler

spec:
  selector:
    matchLabels:
      crawl: "{{ id }}"
      role: crawler

  serviceName: crawler
  replicas: {{ scale }}
  podManagementPolicy: OrderedReady

  # not yet supported
  #persistentVolumeClaimRetentionPolicy:
  #  whenDeleted: Delete
  #  whenScaled: Delete

  # One PVC per replica, mounted at /crawls in the crawler container.
  volumeClaimTemplates:
    - metadata:
        name: crawl-data
        labels:
          crawl: "{{ id }}"
          role: crawler

      spec:
        accessModes:
          - ReadWriteOnce

        resources:
          requests:
            storage: {{ requests_hd }}

        # Omitted entirely when no storage class is configured.
        {% if volume_storage_class %}
        storageClassName: "{{ volume_storage_class }}"
        {% endif %}

  template:
    metadata:
      labels:
        crawl: "{{ id }}"
        role: crawler
      # Any change to this annotation changes the pod template.
      {% if force_restart %}
      annotations:
        btrix.crawlForceRestart: "{{ force_restart }}"
      {% endif %}

    spec:
      terminationGracePeriodSeconds: {{ termination_grace_secs }}
      # NOTE: the expression in the YAML comment below is still evaluated by
      # Jinja at render time; only the YAML parser treats the line as a comment.
      #nodeSelector: {{ crawl_node_selector }}
      volumes:
        - name: crawl-config
          configMap:
            name: crawl-config-{{ cid }}

      affinity:
        # Soft preference for nodes labelled with the configured node type.
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 1
              preference:
                matchExpressions:
                  - key: nodeType
                    operator: In
                    values:
                      - "{{ crawler_node_type }}"

        # Soft preference for the zone already hosting pods of this crawl.
        # - topologyKey uses the stable `topology.kubernetes.io/zone` label;
        #   `failure-domain.beta.kubernetes.io/zone` is deprecated.
        # - only the `crawl` label is matched: the pod template above sets
        #   just `crawl` and `role`, so an additional `job-name` requirement
        #   could never be satisfied by pods of this StatefulSet.
        podAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 2
              podAffinityTerm:
                topologyKey: "topology.kubernetes.io/zone"
                labelSelector:
                  matchLabels:
                    crawl: "{{ id }}"

      # NOTE(review): the tolerated taint value is hard-coded to "crawling"
      # while nodeAffinity uses {{ crawler_node_type }} -- confirm that the
      # two are meant to differ.
      tolerations:
        - key: "nodeType"
          operator: "Equal"
          value: "crawling"
          effect: "NoSchedule"

      containers:
        - name: crawler
          image: "{{ crawler_image }}"
          imagePullPolicy: "{{ crawler_image_pull_policy }}"
          command:
            - crawl
            - --config
            - /tmp/crawl-config.json
            - --redisStoreUrl
            - "{{ redis_url }}"
          {%- if profile_filename %}
            - --profile
            - "@profiles/{{ profile_filename }}"
          {%- endif %}
          volumeMounts:
            # Single file projected from the crawl-config ConfigMap.
            - name: crawl-config
              mountPath: /tmp/crawl-config.json
              subPath: crawl-config.json
              readOnly: true

            - name: crawl-data
              mountPath: /crawls

          envFrom:
            - configMapRef:
                name: shared-crawler-config

            - secretRef:
                name: storage-{{ storage_name }}

          env:
            - name: CRAWL_ID
              value: "{{ id }}"

            - name: WEBHOOK_URL
              value: "{{ redis_url }}/crawls-done"

            - name: STORE_PATH
              value: "{{ store_path }}"

            - name: STORE_FILENAME
              value: "{{ store_filename }}"

            - name: STORE_USER
              value: "{{ userid }}"

          # SOCKS_PORT is only emitted when a SOCKS host is configured too.
          {% if crawler_socks_proxy_host %}
            - name: SOCKS_HOST
              value: "{{ crawler_socks_proxy_host }}"
          {% if crawler_socks_proxy_port %}
            - name: SOCKS_PORT
              value: "{{ crawler_socks_proxy_port }}"
          {% endif %}
          {% endif %}

          # Memory request equals the memory limit; no CPU limit is set.
          resources:
            limits:
              memory: {{ crawler_memory }}

            requests:
              cpu: {{ crawler_cpu }}
              memory: {{ crawler_memory }}

          # Probe is skipped when the port is unset/empty or the string '0'.
          {% if crawler_liveness_port and crawler_liveness_port != '0' %}
          livenessProbe:
            httpGet:
              path: /healthz
              port: {{ crawler_liveness_port }}

            initialDelaySeconds: 15
            periodSeconds: 120
            failureThreshold: 3
          {% endif %}