browsertrix/backend/btrixcloud/k8s/templates/crawler.yaml
Ilya Kreymer 793611e5bb
add exclusion api, fixes #311 (#349)
* add exclusion api, fixes #311
add new apis: `POST crawls/{crawl_id}/exclusion?regex=...` and `DELETE crawls/{crawl_id}/exclusion?regex=...` which will:
- create a new config with 'regex' added as an exclusion (deleting or deactivating the previous config), OR with 'regex' removed as an exclusion.
- update crawl to point to new config
- update statefulset to point to new config, causing crawler pods to restart
- filter out urls matching 'regex' from both queue and seen list (currently a bit slow) (when adding only)
- return 400 if the exclusion already exists when adding, or doesn't exist when removing
- api reads redis list in reverse to match how exclusion queue is used
2022-11-12 17:24:30 -08:00

343 lines
7.7 KiB
YAML

# --------
# REDIS
# --------
---
# Per-crawl Redis instance, rendered once per crawl id.
# A single-replica StatefulSet named after the crawl id, with its own
# 1Gi data volume and a config file taken from the shared-job-config ConfigMap.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: redis-{{ id }}
  namespace: {{ namespace }}
  labels:
    crawl: {{ id }}
    role: redis

spec:
  selector:
    matchLabels:
      crawl: {{ id }}
      role: redis

  # Must match the name of the headless Service defined for Redis.
  serviceName: redis-{{ id }}
  replicas: 1
  podManagementPolicy: Parallel

  # Remove the data PVC both when the StatefulSet is deleted and when it is
  # scaled down, so no volumes are left behind by the StatefulSet.
  persistentVolumeClaimRetentionPolicy:
    whenDeleted: Delete
    whenScaled: Delete

  volumeClaimTemplates:
    - metadata:
        name: redis-data
        labels:
          crawl: {{ id }}
          # Labelled as redis (was "crawler") so the claim carries the same
          # role as the pods and Service that own it.
          role: redis

      spec:
        accessModes:
          - ReadWriteOnce

        resources:
          requests:
            storage: 1Gi

        # Only pin a storage class when one is configured; otherwise the
        # field is omitted entirely.
        {% if volume_storage_class %}
        storageClassName: {{ volume_storage_class }}
        {% endif %}

  template:
    metadata:
      labels:
        crawl: {{ id }}
        role: redis

    spec:
      terminationGracePeriodSeconds: 10
      #nodeSelector: {{ crawl_node_selector }}

      volumes:
        # Exposes only the redis.conf key of the shared ConfigMap.
        - name: shared-redis-conf
          configMap:
            name: shared-job-config
            items:
              - key: redis.conf
                path: redis.conf

        # - name: redis-conf
        #   emptyDir: {}

      # Disabled init container, kept for reference: it would copy the shared
      # config and append a "replicaof" line on every pod except ordinal 0.
      #initContainers:
      #  - name: init-redis
      #    image: {{ redis_image }}
      #    imagePullPolicy: {{ redis_image_pull_policy }}
      #    command:
      #      - bash
      #      - "-c"
      #      - |
      #        set -ex
      #        # if no ordinal found in hostname, something is wrong, exit
      #        [[ `hostname` =~ (.+)-([0-9]+)$ ]] || exit 1
      #        ordinal=${BASH_REMATCH[2]}
      #        base=${BASH_REMATCH[1]}
      #        # if ordinal is 0, use main config, otherwise use replica
      #        cp /shared-redis-conf/redis.conf /redis-conf/redis.conf
      #        if [[ $ordinal -ne 0 ]]; then
      #          echo "replicaof $base-0.$base 6379" >> /redis-conf/redis.conf
      #        fi
      #    volumeMounts:
      #      - name: shared-redis-conf
      #        mountPath: /shared-redis-conf
      #      - name: redis-conf
      #        mountPath: /redis-conf

      affinity:
        # Soft preference for nodes whose nodeType label matches the
        # configured redis node type.
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 1
              preference:
                matchExpressions:
                  - key: nodeType
                    operator: In
                    values:
                      - "{{ redis_node_type }}"

        # Stronger (weight 2) soft preference for the zone that already hosts
        # pods labelled with this crawl id and its job name.
        # NOTE(review): failure-domain.beta.kubernetes.io/zone is a deprecated
        # label; topology.kubernetes.io/zone is its replacement. Confirm the
        # target clusters before switching.
        podAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 2
              podAffinityTerm:
                topologyKey: "failure-domain.beta.kubernetes.io/zone"
                labelSelector:
                  matchLabels:
                    job-name: job-{{ id }}
                    crawl: {{ id }}

      # Allows scheduling onto nodes tainted nodeType=crawling:NoSchedule.
      tolerations:
        - key: "nodeType"
          operator: "Equal"
          value: "crawling"
          effect: "NoSchedule"

      containers:
        - name: redis
          image: {{ redis_image }}
          imagePullPolicy: {{ redis_image_pull_policy }}

          # Config file comes from the shared-redis-conf mount below.
          args: ["/redis-conf/redis.conf", "--appendonly", "yes"]

          volumeMounts:
            - name: redis-data
              mountPath: /data

            - name: shared-redis-conf
              mountPath: /redis-conf

          resources:
            limits:
              cpu: {{ redis_limits_cpu }}
              memory: {{ redis_limits_memory }}

            requests:
              cpu: {{ redis_requests_cpu }}
              memory: {{ redis_requests_memory }}

          # Pod is marked ready once "redis-cli ping" succeeds.
          readinessProbe:
            exec:
              command:
                - redis-cli
                - ping
---
# Headless Service (clusterIP: None) for the Redis pod of this crawl.
# Its name matches the serviceName of the Redis StatefulSet.
apiVersion: v1
kind: Service

metadata:
  name: redis-{{ id }}
  labels:
    role: redis
    crawl: {{ id }}

spec:
  clusterIP: None

  # Selects the pods carrying both the crawl id and the redis role.
  selector:
    role: redis
    crawl: {{ id }}

  ports:
    - name: redis
      port: 6379
      protocol: TCP
# -------
# CRAWLER
# -------
---
# Crawler pods for one crawl, rendered once per crawl id.
# A StatefulSet scaled to the requested number of replicas; each replica gets
# its own crawl-data volume mounted at /crawls and reads its crawl config from
# the crawl-config ConfigMap for the given config id.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: crawl-{{ id }}
  namespace: {{ namespace }}
  labels:
    crawl: {{ id }}
    role: crawler

spec:
  selector:
    matchLabels:
      crawl: {{ id }}
      role: crawler

  # Must match the name of the headless Service defined for the crawler.
  serviceName: crawl-{{ id }}
  replicas: {{ scale }}
  podManagementPolicy: Parallel

  # Remove each replica's PVC both when the StatefulSet is deleted and when
  # it is scaled down.
  persistentVolumeClaimRetentionPolicy:
    whenDeleted: Delete
    whenScaled: Delete

  volumeClaimTemplates:
    - metadata:
        name: crawl-data
        labels:
          crawl: {{ id }}
          role: crawler

      spec:
        accessModes:
          - ReadWriteOnce

        resources:
          requests:
            storage: {{ requests_hd }}

        # Only pin a storage class when one is configured; otherwise the
        # field is omitted entirely.
        {% if volume_storage_class %}
        storageClassName: {{ volume_storage_class }}
        {% endif %}

  template:
    metadata:
      labels:
        crawl: {{ id }}
        role: crawler

    spec:
      terminationGracePeriodSeconds: {{ termination_grace_secs }}
      #nodeSelector: {{ crawl_node_selector }}

      volumes:
        # Keyed by config id (cid), not crawl id.
        - name: crawl-config
          configMap:
            name: crawl-config-{{ cid }}

      affinity:
        # Soft preference for nodes whose nodeType label matches the
        # configured crawler node type.
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 1
              preference:
                matchExpressions:
                  - key: nodeType
                    operator: In
                    values:
                      - "{{ crawler_node_type }}"

        # Stronger (weight 2) soft preference for the zone that already hosts
        # pods labelled with this crawl id and its job name.
        # NOTE(review): failure-domain.beta.kubernetes.io/zone is a deprecated
        # label; topology.kubernetes.io/zone is its replacement. Confirm the
        # target clusters before switching.
        podAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 2
              podAffinityTerm:
                topologyKey: "failure-domain.beta.kubernetes.io/zone"
                labelSelector:
                  matchLabels:
                    job-name: job-{{ id }}
                    crawl: {{ id }}

      # Allows scheduling onto nodes tainted nodeType=crawling:NoSchedule.
      tolerations:
        - key: "nodeType"
          operator: "Equal"
          value: "crawling"
          effect: "NoSchedule"

      containers:
        - name: crawler
          image: {{ crawler_image }}
          imagePullPolicy: {{ crawler_image_pull_policy }}

          # The config path matches the crawl-config mount below. The two
          # --profile arguments are emitted only when PROFILE_FILENAME is set.
          command:
            - crawl
            - --config
            - /tmp/crawl-config.json
            - --redisStoreUrl
            - "{{ redis_url }}"
          {%- if env.PROFILE_FILENAME %}
            - --profile
            - "@profiles/{{ env.PROFILE_FILENAME }}"
          {%- endif %}

          volumeMounts:
            # subPath mounts just the crawl-config.json key as a single file.
            - name: crawl-config
              mountPath: /tmp/crawl-config.json
              subPath: crawl-config.json
              readOnly: true

            - name: crawl-data
              mountPath: /crawls

          envFrom:
            - configMapRef:
                name: shared-crawler-config

            - secretRef:
                name: storage-{{ storage_name }}

          # Env values are quoted so the rendered YAML always yields strings:
          # an unquoted numeric- or boolean-looking value would be typed by
          # the YAML parser and rejected as an env var value.
          env:
            - name: CRAWL_ID
              value: "{{ id }}"

            # Redis URL with /crawls-done appended.
            - name: WEBHOOK_URL
              value: "{{ redis_url }}/crawls-done"

            - name: STORE_PATH
              value: "{{ env.STORE_PATH }}"

            # NOTE(review): this reads STORAGE_FILENAME from env while the
            # sibling entry reads STORE_PATH; confirm which key the job
            # environment actually provides.
            - name: STORE_FILENAME
              value: "{{ env.STORAGE_FILENAME }}"

            - name: STORE_USER
              value: "{{ env.USER_ID }}"

          resources:
            limits:
              cpu: {{ crawler_limits_cpu }}
              memory: {{ crawler_limits_memory }}

            requests:
              cpu: {{ crawler_requests_cpu }}
              memory: {{ crawler_requests_memory }}

          # Liveness probe is emitted only when a port is configured and it
          # is not the string '0'. The port is left unquoted on purpose: a
          # quoted value would be interpreted as a named port.
          {% if crawler_liveness_port and crawler_liveness_port != '0' %}
          livenessProbe:
            httpGet:
              path: /healthz
              port: {{ crawler_liveness_port }}

            initialDelaySeconds: 15
            periodSeconds: 120
            failureThreshold: 3
          {% endif %}
---
# Headless Service (clusterIP: None) for the crawler pods of this crawl.
# Its name matches the serviceName of the crawler StatefulSet.
apiVersion: v1
kind: Service

metadata:
  name: crawl-{{ id }}
  labels:
    role: crawler
    crawl: {{ id }}

spec:
  clusterIP: None

  # Selects the pods carrying both the crawl id and the crawler role.
  selector:
    role: crawler
    crawl: {{ id }}

  ports:
    - name: screencast
      port: 9037
      protocol: TCP