Previously, the crawler pods used preferred node affinity instead of required node affinity. This resulted in crawler pods running on the main node pool. Instead, we want to ensure crawler pods run on the dedicated node pool (if configured). - Converts 'preferred node affinity' to 'required node affinity' for the node pool, while keeping preferred pod affinity so that all crawler / redis pods stay together. - For profiles, updates to the same node affinity, and also adds resource constraints to the profile browser, which previously had none, matching those of a single crawler.
181 lines
3.7 KiB
YAML
181 lines
3.7 KiB
YAML
# -------
# PVC
# -------
# Per-crawler PersistentVolumeClaim. The crawler pod defined later in this
# template claims it by the same name and mounts it at /crawls.
# Templated string values are quoted so the rendered YAML always keeps them as
# strings (label values in particular must not be parsed as numbers or
# booleans).

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: "{{ name }}"
  namespace: "{{ namespace }}"
  labels:
    crawl: "{{ id }}"
    role: crawler

spec:
  accessModes:
    - ReadWriteOnce

  resources:
    requests:
      storage: "{{ crawler_storage }}"

  {% if volume_storage_class %}
  # Rendered only when a storage class is configured; the field is omitted
  # otherwise.
  storageClassName: "{{ volume_storage_class }}"
  {% endif %}
|
|
|
|
|
|
|
|
# -------
# CRAWLER
# -------
# Crawler pod for a single crawl. The whole document is skipped when
# do_restart is set, so only the PVC above is rendered in that case.
# Templated string values are quoted so the rendered YAML keeps them as
# strings; integer fields (grace period, probe port) are left unquoted on
# purpose.
{% if not do_restart %}
---
apiVersion: v1
kind: Pod
metadata:
  name: "{{ name }}"
  namespace: "{{ namespace }}"
  labels:
    crawl: "{{ id }}"
    role: crawler

spec:
  hostname: "{{ name }}"
  subdomain: crawler

  {% if priorityClassName %}
  priorityClassName: "{{ priorityClassName }}"
  {% endif %}

  restartPolicy: OnFailure

  # Integer field: must stay unquoted.
  terminationGracePeriodSeconds: {{ termination_grace_secs }}

  volumes:
    - name: crawl-config
      configMap:
        name: "crawl-config-{{ cid }}"

    # Backed by the PVC declared at the top of this template (same name).
    - name: crawl-data
      persistentVolumeClaim:
        claimName: "{{ name }}"

  affinity:
    {% if crawler_node_type %}
    # Hard requirement: when a crawler node type is configured, the pod may
    # only be scheduled onto nodes whose nodeType label matches it.
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: nodeType
                operator: In
                values:
                  - "{{ crawler_node_type }}"
    {% endif %}

    # Soft preference: co-locate on the same host with other pods that carry
    # the same crawl label.
    podAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 10
          podAffinityTerm:
            topologyKey: "kubernetes.io/hostname"
            labelSelector:
              matchExpressions:
                - key: crawl
                  operator: In
                  values:
                    - "{{ id }}"

  tolerations:
    # NOTE(review): this toleration value is hard-coded to "crawling" while the
    # node affinity above uses the configured crawler node type - confirm the
    # two are meant to match.
    - key: nodeType
      operator: Equal
      value: crawling
      effect: NoSchedule
    - key: node.kubernetes.io/not-ready
      operator: Exists
      tolerationSeconds: 300
      effect: NoExecute
    - key: node.kubernetes.io/unreachable
      operator: Exists
      effect: NoExecute
      tolerationSeconds: 300

  containers:
    - name: crawler
      image: "{{ crawler_image }}"
      imagePullPolicy: "{{ crawler_image_pull_policy }}"
      command:
        - crawl
        - --config
        - /tmp/crawl-config.json
        - --redisStoreUrl
        - "{{ redis_url }}"
      {%- if profile_filename %}
        - --profile
        - "@{{ profile_filename }}"
      {%- endif %}

      volumeMounts:
        # Mounts only the crawl-config.json key of the config map as a file.
        - name: crawl-config
          mountPath: /tmp/crawl-config.json
          subPath: crawl-config.json
          readOnly: true

        - name: crawl-data
          mountPath: /crawls

      envFrom:
        - configMapRef:
            name: shared-crawler-config

        - secretRef:
            name: "{{ storage_secret }}"

        {% if signing_secret %}
        - secretRef:
            name: "{{ signing_secret }}"
        {% endif %}

      env:
        - name: CRAWL_ID
          value: "{{ id }}"

        - name: WEBHOOK_URL
          value: "{{ redis_url }}/crawls-done"

        - name: STORE_PATH
          value: "{{ storage_path }}"

        - name: STORE_FILENAME
          value: "{{ storage_filename }}"

        - name: STORE_USER
          value: "{{ userid }}"

        {% if crawler_socks_proxy_host %}
        - name: SOCKS_HOST
          value: "{{ crawler_socks_proxy_host }}"
        {% if crawler_socks_proxy_port %}
        - name: SOCKS_PORT
          value: "{{ crawler_socks_proxy_port }}"
        {% endif %}
        {% endif %}

      # Memory request equals the memory limit; no CPU limit is set.
      resources:
        limits:
          memory: "{{ memory }}"

        requests:
          cpu: "{{ cpu }}"
          memory: "{{ memory }}"

      {% if crawler_liveness_port and crawler_liveness_port != '0' %}
      # Rendered only when a liveness port is set and is not the string '0'.
      livenessProbe:
        httpGet:
          path: /healthz
          # Left unquoted on purpose: a quoted value would be read as a named
          # port rather than a port number.
          port: {{ crawler_liveness_port }}

        initialDelaySeconds: 15
        periodSeconds: 120
        failureThreshold: 3
      {% endif %}

{% endif %}
|