Previously, the crawl workflow settings were not included in QA runs at all. This change mounts the crawl workflow config, as well as the QA configmap, into QA run crawls, allowing page limits from the crawl workflow to be applied to QA runs. It also allows a different number of browser instances to be used for QA runs, as QA runs might work better with fewer browsers (e.g. 2 instead of 4). This can be set with `qa_browser_instances` in the helm chart. The number of QA browser workers defaults to 1 if unset (for now, for best results). Fixes #1828
210 lines
4.5 KiB
YAML
210 lines
4.5 KiB
YAML
# -------
# PVC
# -------
# Volume claim for a single crawler instance. It is rendered with the same
# `name` that the crawler pod uses for its `claimName`, and carries the same
# crawl/role labels as the pod.
#
# Templated string values are quoted so that an empty or number-like
# expansion is still parsed as a string by the YAML loader.

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: "{{ name }}"
  namespace: "{{ namespace }}"
  labels:
    crawl: "{{ id }}"
    role: crawler

spec:
  accessModes:
    - ReadWriteOnce

  resources:
    requests:
      storage: "{{ crawler_storage }}"

  # Only emitted when a storage class is configured; otherwise the field is
  # omitted from the rendered manifest entirely.
  {% if volume_storage_class %}
  storageClassName: "{{ volume_storage_class }}"
  {% endif %}
|
|
|
|
|
|
|
|
# -------
# CRAWLER
# -------
# Crawler pod for one crawl instance. The whole document is skipped when
# `do_restart` is set, so only the PVC above is rendered in that case.
#
# When `qa_source_crawl_id` is set the pod runs the `qa` command instead of
# `crawl`, and additionally mounts the QA replay configmap alongside the
# regular crawl workflow config.
#
# Templated string values are quoted so the YAML loader cannot retype them;
# fields that must be integers are intentionally left unquoted.
{% if not do_restart %}
---
apiVersion: v1
kind: Pod
metadata:
  name: "{{ name }}"
  namespace: "{{ namespace }}"
  labels:
    crawl: "{{ id }}"
    role: crawler

spec:
  hostname: "{{ name }}"
  subdomain: crawler

  {% if priorityClassName %}
  priorityClassName: "{{ priorityClassName }}"
  {% endif %}

  restartPolicy: OnFailure

  securityContext:
    runAsNonRoot: true
    # Numeric ids: left unquoted so they render as integers.
    runAsUser: {{ crawler_uid }}
    runAsGroup: {{ crawler_gid }}
    fsGroup: {{ crawler_fsgroup }}
    # NOTE(review): in the Kubernetes API the two fields below belong to the
    # container-level securityContext, not the pod-level one, so they may be
    # ignored (or rejected under strict validation) here. They are left in
    # place because enforcing a read-only root filesystem on the container
    # could change crawler behavior - confirm before relocating them.
    allowPrivilegeEscalation: false
    readOnlyRootFilesystem: true

  # Integer number of seconds: left unquoted.
  terminationGracePeriodSeconds: {{ termination_grace_secs }}

  volumes:
    # Crawl workflow config, always mounted (also for QA runs).
    - name: crawl-config
      configMap:
        name: "crawl-config-{{ cid }}"
    {% if qa_source_crawl_id %}
    # QA replay config, only present for QA runs.
    - name: qa-config
      configMap:
        name: "qa-replay-{{ qa_source_crawl_id }}"
    {% endif %}
    # Uses the claim rendered with the same name as this pod.
    - name: crawl-data
      persistentVolumeClaim:
        claimName: "{{ name }}"

  affinity:
    {% if crawler_node_type %}
    # Hard requirement: schedule only on nodes labelled with the configured
    # nodeType.
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: nodeType
                operator: In
                values:
                  - "{{ crawler_node_type }}"
    {% endif %}

    # Soft preference: co-locate with other pods carrying the same crawl
    # label on the same host.
    podAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 10
          podAffinityTerm:
            topologyKey: "kubernetes.io/hostname"
            labelSelector:
              matchExpressions:
                - key: crawl
                  operator: In
                  values:
                    - "{{ id }}"

  tolerations:
    - key: nodeType
      operator: Equal
      value: crawling
      effect: NoSchedule
    - key: node.kubernetes.io/not-ready
      operator: Exists
      tolerationSeconds: 300
      effect: NoExecute
    - key: node.kubernetes.io/unreachable
      operator: Exists
      effect: NoExecute
      tolerationSeconds: 300

  containers:
    - name: crawler
      image: "{{ crawler_image }}"
      imagePullPolicy: "{{ crawler_image_pull_policy }}"
      command:
        # Entry point is "qa" for QA runs and "crawl" otherwise.
        - {{ "crawl" if not qa_source_crawl_id else "qa" }}
        - --config
        - /tmp/crawl-config.json
        - --workers
        - "{{ workers }}"
        - --redisStoreUrl
        - "{{ redis_url }}"
        {% if qa_source_crawl_id %}
        - --qaSource
        - /tmp/qa-config.json
        {% elif profile_filename %}
        # The profile argument is only passed for non-QA runs.
        - --profile
        - "@{{ profile_filename }}"
        {% endif %}
      volumeMounts:
        - name: crawl-config
          mountPath: /tmp/crawl-config.json
          subPath: crawl-config.json
          readOnly: true

        {% if qa_source_crawl_id %}
        - name: qa-config
          mountPath: /tmp/qa-config.json
          subPath: qa-config.json
          readOnly: true
        {% endif %}

        - name: crawl-data
          mountPath: /crawls
      envFrom:
        - configMapRef:
            name: shared-crawler-config

        - secretRef:
            name: "{{ storage_secret }}"

        {% if signing_secret %}
        - secretRef:
            name: "{{ signing_secret }}"
        {% endif %}

      env:
        - name: HOME
          value: /crawls/home

        - name: CRAWL_ID
          value: "{{ id }}"

        - name: WEBHOOK_URL
          value: "{{ redis_url }}/crawls-done"

        - name: STORE_PATH
          value: "{{ storage_path }}"

        - name: STORE_FILENAME
          value: "{{ storage_filename }}"

        - name: STORE_USER
          value: "{{ userid }}"

        - name: WARC_PREFIX
          value: "{{ warc_prefix }}"

        {% if crawler_socks_proxy_host %}
        - name: SOCKS_HOST
          value: "{{ crawler_socks_proxy_host }}"
        {% if crawler_socks_proxy_port %}
        - name: SOCKS_PORT
          value: "{{ crawler_socks_proxy_port }}"
        {% endif %}
        {% endif %}

      resources:
        limits:
          memory: "{{ memory_limit }}"

        requests:
          cpu: "{{ cpu }}"
          memory: "{{ memory }}"

      # The probe is omitted when the liveness port is unset or the string '0'.
      {% if crawler_liveness_port and crawler_liveness_port != '0' %}
      livenessProbe:
        httpGet:
          path: /healthz
          # Left unquoted on purpose: a quoted value here would be read as a
          # named port rather than a port number.
          port: {{ crawler_liveness_port }}

        initialDelaySeconds: 15
        periodSeconds: 120
        failureThreshold: 3
      {% endif %}

{% endif %}
|