Attempt to auto-adjust PVC storage if:
- used storage (as reported in Redis by the crawler) * 2.5 > total storage
- will cause the PVC to resize, if possible (not supported by all drivers)
- uses multiples of 1Gi, rounding up to the next 1Gi multiple
- AVAIL_STORAGE_RATIO is hard-coded to 2.5 for now, to account for 2x the space for the WACZ plus headroom for fast-updating crawls

For example, if the crawler reports 2.2 GiB used on a 5Gi PVC, then 2.2 * 2.5 = 5.5 GiB exceeds the total, so the PVC request would be raised to 6Gi (the next 1Gi multiple).

Some caveats:
- only works if the storageClass used for PVCs has `allowVolumeExpansion: true`; if not, it will have no effect (see the StorageClass sketch below)
- designed as a last-resort option: `crawl_storage` in the Helm values, together with the crawler's `--sizeLimit` and `--diskUtilization` limits, should generally make this unnecessary
- can be useful when a crawl is rapidly capturing a lot of content on a single page and there is no time to interrupt / restart, since the other limits apply only at page end
- may want to have the crawler update disk usage more frequently, not just at page end, to make this more effective
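For reference, expansion has to be enabled on the StorageClass itself. A minimal sketch of such a class follows; the class name and provisioner here are illustrative only (use whatever your cluster actually provides), and `volume_storage_class` in the template below would be set to match:

```yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: crawler-storage           # illustrative name; point volume_storage_class at it
provisioner: ebs.csi.aws.com      # example CSI driver; substitute your cluster's provisioner
allowVolumeExpansion: true        # required for the PVC auto-resize above to take effect
reclaimPolicy: Delete
volumeBindingMode: WaitForFirstConsumer
```

Note that `allowVolumeExpansion: true` only permits the PVC's requested size to be increased; whether a live resize actually succeeds still depends on the CSI driver.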
# -------
# PVC
# -------
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ name }}
  namespace: {{ namespace }}
  labels:
    crawl: {{ id }}
    role: crawler

spec:
  accessModes:
    - ReadWriteOnce

  resources:
    requests:
      storage: {{ storage }}

  {% if volume_storage_class %}
  storageClassName: {{ volume_storage_class }}
  {% endif %}


# -------
# CRAWLER
# -------
{% if not do_restart %}
---
apiVersion: v1
kind: Pod
metadata:
  name: {{ name }}
  namespace: {{ namespace }}
  labels:
    crawl: {{ id }}
    role: crawler
    network-policy: limit-crawler-egress

spec:
  hostname: {{ name }}
  subdomain: crawler

  {% if priorityClassName %}
  priorityClassName: {{ priorityClassName }}
  {% endif %}

  restartPolicy: OnFailure

  securityContext:
    runAsNonRoot: true
    runAsUser: {{ crawler_uid }}
    runAsGroup: {{ crawler_gid }}
    fsGroup: {{ crawler_fsgroup }}
    allowPrivilegeEscalation: false
    readOnlyRootFilesystem: true

  terminationGracePeriodSeconds: {{ termination_grace_secs }}
  volumes:
    - name: crawl-config
      configMap:
        name: crawl-config-{{ id }}
  {% if qa_source_crawl_id %}
    - name: qa-config
      configMap:
        name: qa-replay-{{ qa_source_crawl_id }}
  {% endif %}
    - name: crawl-data
      persistentVolumeClaim:
        claimName: {{ name }}

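  # pin to dedicated crawler nodes (if configured) and prefer
  # co-locating pods that belong to the same crawl on one node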
  affinity:
  {% if crawler_node_type %}
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: nodeType
                operator: In
                values:
                  - "{{ crawler_node_type }}"
  {% endif %}

    podAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 10
          podAffinityTerm:
            topologyKey: "kubernetes.io/hostname"
            labelSelector:
              matchExpressions:
                - key: crawl
                  operator: In
                  values:
                    - {{ id }}

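  # run on nodes tainted for crawling; ride out a not-ready or unreachable
  # node for up to 5 minutes before being evicted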
  tolerations:
    - key: nodeType
      operator: Equal
      value: crawling
      effect: NoSchedule
    - key: node.kubernetes.io/not-ready
      operator: Exists
      tolerationSeconds: 300
      effect: NoExecute
    - key: node.kubernetes.io/unreachable
      operator: Exists
      effect: NoExecute
      tolerationSeconds: 300

  containers:
    - name: crawler
      image: {{ crawler_image }}
      imagePullPolicy: {{ crawler_image_pull_policy }}
      command:
        - {{ "crawl" if not qa_source_crawl_id else "qa" }}
        - --config
        - /tmp/crawl-config.json
        - --workers
        - "{{ workers }}"
        - --redisStoreUrl
        - {{ redis_url }}
      {% if qa_source_crawl_id %}
        - --qaSource
        - /tmp/qa/qa-config.json
      {% elif profile_filename %}
        - --profile
        - "@{{ profile_filename }}"
      {% endif %}
      volumeMounts:
        - name: crawl-config
          mountPath: /tmp/crawl-config.json
          subPath: crawl-config.json
          readOnly: True

      {% if qa_source_crawl_id %}
        - name: qa-config
          mountPath: /tmp/qa/
          readOnly: True
      {% endif %}

        - name: crawl-data
          mountPath: /crawls
      envFrom:
        - configMapRef:
            name: shared-crawler-config

        - secretRef:
            name: {{ storage_secret }}

      {% if signing_secret %}
        - secretRef:
            name: {{ signing_secret }}
      {% endif %}

      env:
        - name: HOME
          value: /crawls/home

        - name: CRAWL_ID
          value: "{{ id }}"

        - name: WEBHOOK_URL
          value: "{{ redis_url }}/crawls-done"

        - name: STORE_PATH
          value: "{{ storage_path }}"

        - name: STORE_FILENAME
          value: "{{ storage_filename }}"

        - name: STORE_USER
          value: "{{ userid }}"

        - name: WARC_PREFIX
          value: "{{ warc_prefix }}"

      {% if crawler_socks_proxy_host %}
        - name: SOCKS_HOST
          value: "{{ crawler_socks_proxy_host }}"
      {% if crawler_socks_proxy_port %}
        - name: SOCKS_PORT
          value: "{{ crawler_socks_proxy_port }}"
      {% endif %}
      {% endif %}

      resources:
        limits:
          memory: "{{ memory_limit }}"

        requests:
          cpu: "{{ cpu }}"
          memory: "{{ memory }}"

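      # probe the crawler's healthcheck endpoint; disabled entirely when
      # crawler_liveness_port is unset or "0"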
      {% if crawler_liveness_port and crawler_liveness_port != '0' %}
      livenessProbe:
        httpGet:
          path: /healthz
          port: {{ crawler_liveness_port }}

        initialDelaySeconds: 15
        periodSeconds: 120
        failureThreshold: 3
      {% endif %}

{% endif %}