# -------
# PVC
# -------
# Volume claim that the crawler Pod below mounts at /crawls (via the
# crawl-data volume). It is rendered unconditionally, whereas the Pod is
# omitted whenever do_restart is set.
#
# Quoting convention used throughout this template: every templated value
# that Kubernetes expects to be a string is double-quoted, so an expansion
# that is empty, all digits, or boolean-looking can never change YAML type.
# Values that must stay numeric (uids, seconds, ports, quantities) are left
# unquoted on purpose.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: "{{ name }}"
  namespace: "{{ namespace }}"
  labels:
    crawl: "{{ id }}"
    role: crawler

spec:
  accessModes:
    - ReadWriteOnce

  resources:
    requests:
      storage: {{ storage }}

  {% if volume_storage_class %}
  storageClassName: "{{ volume_storage_class }}"
  {% endif %}


# -------
# CRAWLER
# -------
# Crawler Pod. Shares its name with the PVC above and is skipped entirely
# while do_restart is set, so that only the claim is rendered.
{% if not do_restart %}
---
apiVersion: v1
kind: Pod
metadata:
  name: "{{ name }}"
  namespace: "{{ namespace }}"
  labels:
    crawl: "{{ id }}"
    role: crawler
    network-policy: limit-crawler-egress

spec:
  hostname: "{{ name }}"
  subdomain: crawler

  {% if priorityClassName %}
  priorityClassName: "{{ priorityClassName }}"
  {% endif %}

  restartPolicy: OnFailure

  securityContext:
    runAsNonRoot: true
    runAsUser: {{ crawler_uid }}
    runAsGroup: {{ crawler_gid }}
    fsGroup: {{ crawler_fsgroup }}
    # NOTE(review): allowPrivilegeEscalation and readOnlyRootFilesystem are
    # container-level SecurityContext fields, not PodSecurityContext fields,
    # so at this level they are presumably dropped or rejected by the API
    # server rather than enforced. They are left in place to keep the
    # rendered manifest unchanged; moving them under the crawler container
    # would start enforcing them and needs testing first.
    allowPrivilegeEscalation: false
    readOnlyRootFilesystem: true

  terminationGracePeriodSeconds: {{ termination_grace_secs }}

  volumes:
    - name: crawl-config
      configMap:
        name: crawl-config-{{ id }}
    {% if qa_source_crawl_id %}
    - name: qa-config
      configMap:
        name: qa-replay-{{ qa_source_crawl_id }}
    {% endif %}
    - name: crawl-data
      persistentVolumeClaim:
        claimName: "{{ name }}"
    {% if proxy_id %}
    # defaultMode 384 is octal 0600 (owner read/write only). It is written in
    # decimal so that YAML 1.1 and YAML 1.2 parsers agree on the value.
    - name: proxies
      secret:
        secretName: proxies
        defaultMode: 384
    - name: force-user-and-group-name
      secret:
        secretName: force-user-and-group-name
        defaultMode: 384
    {% endif %}

  affinity:
    {% if crawler_node_type %}
    # Hard requirement: only schedule onto nodes labelled with the
    # configured nodeType.
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: nodeType
                operator: In
                values:
                  - "{{ crawler_node_type }}"
    {% endif %}

    # Soft preference: co-locate with other pods carrying the same crawl
    # label on the same host.
    podAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 10
          podAffinityTerm:
            topologyKey: "kubernetes.io/hostname"
            labelSelector:
              matchExpressions:
                - key: crawl
                  operator: In
                  values:
                    - "{{ id }}"

  tolerations:
    - key: nodeType
      operator: Equal
      value: crawling
      effect: NoSchedule
    - key: node.kubernetes.io/not-ready
      operator: Exists
      tolerationSeconds: 300
      effect: NoExecute
    - key: node.kubernetes.io/unreachable
      operator: Exists
      effect: NoExecute
      tolerationSeconds: 300

  containers:
    - name: crawler
      image: "{{ crawler_image }}"
      imagePullPolicy: "{{ crawler_image_pull_policy }}"
      # The entry point is "qa" when a QA source crawl is given, otherwise
      # "crawl". The QA source and the browser profile are mutually
      # exclusive arguments (if / elif below).
      command:
        - {{ "crawl" if not qa_source_crawl_id else "qa" }}
        - --config
        - /tmp/config/crawl-config.json
        - --workers
        - "{{ workers }}"
        - --redisStoreUrl
        - "{{ redis_url }}"
        {% if qa_source_crawl_id %}
        - --qaSource
        - /tmp/qa/qa-config.json
        {% elif profile_filename %}
        - --profile
        - "@{{ profile_filename }}"
        {% endif %}
        {% if proxy_id %}
        - --proxyServer
        - "{{ proxy_url }}"
        {% if proxy_ssh_private_key %}
        - --sshProxyPrivateKeyFile
        - /tmp/ssh-proxy/private-key
        {% endif %}
        {% if proxy_ssh_host_public_key %}
        - --sshProxyKnownHostsFile
        - /tmp/ssh-proxy/known-hosts
        {% endif %}
        {% endif %}

      volumeMounts:
        - name: crawl-config
          mountPath: /tmp/config/
          readOnly: true
        {% if qa_source_crawl_id %}
        - name: qa-config
          mountPath: /tmp/qa/
          readOnly: true
        {% endif %}
        {% if proxy_id %}
        # Per-proxy key material is selected out of the shared "proxies"
        # secret by subPath, keyed on proxy_id.
        {% if proxy_ssh_private_key %}
        - name: proxies
          mountPath: /tmp/ssh-proxy/private-key
          subPath: "{{ proxy_id }}-private-key"
          readOnly: true
        {% endif %}
        {% if proxy_ssh_host_public_key %}
        - name: proxies
          mountPath: /tmp/ssh-proxy/known-hosts
          subPath: "{{ proxy_id }}-known-hosts"
          readOnly: true
        {% endif %}
        # Overlay passwd and group files from the secret when a proxy is
        # configured.
        - name: force-user-and-group-name
          mountPath: /etc/passwd
          subPath: passwd
          readOnly: true
        - name: force-user-and-group-name
          mountPath: /etc/group
          subPath: group
          readOnly: true
        {% endif %}
        - name: crawl-data
          mountPath: /crawls

      envFrom:
        - configMapRef:
            name: shared-crawler-config
        - secretRef:
            name: "{{ storage_secret }}"
        {% if signing_secret %}
        - secretRef:
            name: "{{ signing_secret }}"
        {% endif %}

      env:
        - name: HOME
          value: /crawls/home
        - name: CRAWL_ID
          value: "{{ id }}"
        - name: WEBHOOK_URL
          value: "{{ redis_url }}/crawls-done"
        - name: STORE_PATH
          value: "{{ storage_path }}"
        - name: STORE_FILENAME
          value: "{{ storage_filename }}"
        - name: STORE_USER
          value: "{{ userid }}"
        - name: WARC_PREFIX
          value: "{{ warc_prefix }}"

      resources:
        limits:
          memory: "{{ memory_limit }}"
        requests:
          cpu: "{{ cpu }}"
          memory: "{{ memory }}"

      {% if crawler_liveness_port and crawler_liveness_port != '0' %}
      # The port is intentionally left unquoted: a quoted value would be
      # read as a named port rather than a port number.
      livenessProbe:
        httpGet:
          path: /healthz
          port: {{ crawler_liveness_port }}
        initialDelaySeconds: 15
        periodSeconds: 120
        failureThreshold: 3
      {% endif %}
{% endif %}