browsertrix/chart/app-templates/crawler.yaml
Ilya Kreymer 3cd52342a7
Remove Crawl Workflow Configmaps (#1894)
Fixes #1893 

- Removes crawl workflow-scoped configmaps, and replaces with operator-controlled
per-crawl configmaps that only contain the json config passed to Browsertrix
Crawler (as a volume).
- Other configmap settings replaced are replaced the custom CrawlJob options
(mostly already were, just added profile_filename and storage_filename)
- Cron jobs also updated to create CrawlJob without relying on configmaps,
querying the db for additional settings.
- The `userid` associated with cron jobs is set to the user that last modified
 the schedule of the crawl, rather than whomever last modified the workflow
- Various functions that deal with updating configmaps have been removed,
including in migrations.
- New migration 0029 added to remove all crawl workflow configmaps
2024-06-28 15:25:23 -07:00

209 lines
4.5 KiB
YAML

# -------
# PVC
# -------
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: {{ name }}
namespace: {{ namespace }}
labels:
crawl: {{ id }}
role: crawler
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ crawler_storage }}
{% if volume_storage_class %}
storageClassName: {{ volume_storage_class }}
{% endif %}
# -------
# CRAWLER
# -------
{% if not do_restart %}
---
apiVersion: v1
kind: Pod
metadata:
name: {{ name }}
namespace: {{ namespace }}
labels:
crawl: {{ id }}
role: crawler
spec:
hostname: {{ name }}
subdomain: crawler
{% if priorityClassName %}
priorityClassName: {{ priorityClassName }}
{% endif %}
restartPolicy: OnFailure
securityContext:
runAsNonRoot: true
runAsUser: {{ crawler_uid}}
runAsGroup: {{ crawler_gid}}
fsGroup: {{ crawler_fsgroup }}
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
terminationGracePeriodSeconds: {{ termination_grace_secs }}
volumes:
- name: crawl-config
configMap:
name: crawl-config-{{ id }}
{% if qa_source_crawl_id %}
- name: qa-config
configMap:
name: qa-replay-{{ qa_source_crawl_id }}
{% endif %}
- name: crawl-data
persistentVolumeClaim:
claimName: {{ name }}
affinity:
{% if crawler_node_type %}
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: nodeType
operator: In
values:
- "{{ crawler_node_type }}"
{% endif %}
podAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 10
podAffinityTerm:
topologyKey: "kubernetes.io/hostname"
labelSelector:
matchExpressions:
- key: crawl
operator: In
values:
- {{ id }}
tolerations:
- key: nodeType
operator: Equal
value: crawling
effect: NoSchedule
- key: node.kubernetes.io/not-ready
operator: Exists
tolerationSeconds: 300
effect: NoExecute
- key: node.kubernetes.io/unreachable
operator: Exists
effect: NoExecute
tolerationSeconds: 300
containers:
- name: crawler
image: {{ crawler_image }}
imagePullPolicy: {{ crawler_image_pull_policy }}
command:
- {{ "crawl" if not qa_source_crawl_id else "qa" }}
- --config
- /tmp/crawl-config.json
- --workers
- "{{ workers }}"
- --redisStoreUrl
- {{ redis_url }}
{% if qa_source_crawl_id %}
- --qaSource
- /tmp/qa/qa-config.json
{% elif profile_filename %}
- --profile
- "@{{ profile_filename }}"
{% endif %}
volumeMounts:
- name: crawl-config
mountPath: /tmp/crawl-config.json
subPath: crawl-config.json
readOnly: True
{% if qa_source_crawl_id %}
- name: qa-config
mountPath: /tmp/qa/
readOnly: True
{% endif %}
- name: crawl-data
mountPath: /crawls
envFrom:
- configMapRef:
name: shared-crawler-config
- secretRef:
name: {{ storage_secret }}
{% if signing_secret %}
- secretRef:
name: {{ signing_secret }}
{% endif %}
env:
- name: HOME
value: /crawls/home
- name: CRAWL_ID
value: "{{ id }}"
- name: WEBHOOK_URL
value: "{{ redis_url }}/crawls-done"
- name: STORE_PATH
value: "{{ storage_path }}"
- name: STORE_FILENAME
value: "{{ storage_filename }}"
- name: STORE_USER
value: "{{ userid }}"
- name: WARC_PREFIX
value: "{{ warc_prefix }}"
{% if crawler_socks_proxy_host %}
- name: SOCKS_HOST
value: "{{ crawler_socks_proxy_host }}"
{% if crawler_socks_proxy_port %}
- name: SOCKS_PORT
value: "{{ crawler_socks_proxy_port }}"
{% endif %}
{% endif %}
resources:
limits:
memory: "{{ memory_limit }}"
requests:
cpu: "{{ cpu }}"
memory: "{{ memory }}"
{% if crawler_liveness_port and crawler_liveness_port != '0' %}
livenessProbe:
httpGet:
path: /healthz
port: {{ crawler_liveness_port }}
initialDelaySeconds: 15
periodSeconds: 120
failureThreshold: 3
{% endif %}
{% endif %}