# browsertrix/chart/app-templates/crawler.yaml

# -------
# PVC
# -------

# Volume claim for the crawler's working data. It shares its name with the
# crawler pod below, which mounts it through claimName.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: "{{ name }}"
  namespace: "{{ namespace }}"
  labels:
    # Quoted so that an all-digit id still renders as a string; label values
    # must be strings.
    crawl: "{{ id }}"
    role: crawler

spec:
  accessModes:
    - ReadWriteOnce

  resources:
    requests:
      storage: "{{ crawler_storage }}"

  {% if volume_storage_class %}
  # Emitted only when a storage class is configured; otherwise the key is
  # omitted from the claim entirely.
  storageClassName: "{{ volume_storage_class }}"
  {% endif %}


# -------
# CRAWLER
# -------
{% if not do_restart %}
---
# Crawler pod. The whole document is left out of the rendered output while
# do_restart is set (see the enclosing if / endif pair).
apiVersion: v1
kind: Pod
metadata:
  name: "{{ name }}"
  namespace: "{{ namespace }}"
  labels:
    # Quoted so that an all-digit id still renders as a string; label values
    # must be strings.
    crawl: "{{ id }}"
    role: crawler

spec:
  # Lifecycle: failed containers are restarted in place, and shutdown is given
  # the configured grace period.
  restartPolicy: OnFailure
  terminationGracePeriodSeconds: {{ termination_grace_secs }}

  # Pod DNS identity: hostname is the pod name, under the "crawler" subdomain.
  # Presumably paired with a headless Service of that name - verify.
  hostname: {{ name }}
  subdomain: crawler

  {% if priorityClassName %}
  # Emitted only when a priority class name is supplied.
  priorityClassName: {{ priorityClassName }}
  {% endif %}

  securityContext:
    runAsUser: {{ crawler_uid }}
    runAsGroup: {{ crawler_gid }}
    fsGroup: {{ crawler_fsgroup }}
    runAsNonRoot: true
    # NOTE(review): allowPrivilegeEscalation and readOnlyRootFilesystem are
    # documented as container-level securityContext fields in the Kubernetes
    # API; at pod level they may be ignored or rejected - confirm, and move
    # them under the container if enforcement is actually intended.
    allowPrivilegeEscalation: false
    readOnlyRootFilesystem: true

  volumes:
    # Crawl configuration ConfigMap, named after cid.
    - name: crawl-config
      configMap:
        name: crawl-config-{{ cid }}
    {% if qa_source_crawl_id %}
    # QA replay configuration; present only when a QA source crawl id is given.
    - name: qa-config
      configMap:
        name: qa-replay-{{ qa_source_crawl_id }}
    {% endif %}
    # Working storage: the claim defined at the top of this file, which uses
    # the same name as the pod.
    - name: crawl-data
      persistentVolumeClaim:
        claimName: {{ name }}


  affinity:
    {% if crawler_node_type %}
    # Hard requirement, emitted only when a crawler node type is configured:
    # schedule only onto nodes whose nodeType label matches it.
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: nodeType
                operator: In
                values:
                  - "{{ crawler_node_type }}"
    {% endif %}

    # Soft preference (weight 10): co-locate with other pods carrying the same
    # crawl label on the same host.
    podAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 10
          podAffinityTerm:
            topologyKey: "kubernetes.io/hostname"
            labelSelector:
              matchExpressions:
                - key: crawl
                  operator: In
                  values:
                    # Quoted so an all-digit id is still a string, as selector
                    # values must be.
                    - "{{ id }}"

  tolerations:
    # Permit scheduling onto nodes tainted nodeType=crawling with NoSchedule.
    - key: nodeType
      value: crawling
      operator: Equal
      effect: NoSchedule

    # Tolerate the not-ready and unreachable NoExecute taints for 300 seconds
    # each.
    - key: node.kubernetes.io/not-ready
      effect: NoExecute
      operator: Exists
      tolerationSeconds: 300
    - key: node.kubernetes.io/unreachable
      effect: NoExecute
      operator: Exists
      tolerationSeconds: 300

  containers:
    - name: crawler
      image: {{ crawler_image }}
      imagePullPolicy: {{ crawler_image_pull_policy }}

      # Entry point is "qa" when a QA source crawl id is given, otherwise
      # "crawl". All arguments are kept as strings.
      command:
        - {{ "crawl" if not qa_source_crawl_id else "qa" }}
        - --config
        - /tmp/crawl-config.json
        - --workers
        - "{{ workers }}"
        - --redisStoreUrl
        - "{{ redis_url }}"
        {% if qa_source_crawl_id %}
        - --qaSource
        - /tmp/qa-config.json
        {% elif profile_filename %}
        # A browser profile is passed only for non-QA runs (elif branch).
        - --profile
        - "@{{ profile_filename }}"
        {% endif %}

      volumeMounts:
        # Single-file mounts (subPath) of the config maps, read-only.
        - name: crawl-config
          mountPath: /tmp/crawl-config.json
          subPath: crawl-config.json
          readOnly: true

        {% if qa_source_crawl_id %}
        - name: qa-config
          mountPath: /tmp/qa-config.json
          subPath: qa-config.json
          readOnly: true
        {% endif %}

        # Writable working directory backed by the crawl-data volume.
        - name: crawl-data
          mountPath: /crawls

      envFrom:
        - configMapRef:
            name: shared-crawler-config

        - secretRef:
            name: {{ storage_secret }}

        {% if signing_secret %}
        # Added only when a signing secret is configured.
        - secretRef:
            name: {{ signing_secret }}
        {% endif %}

      env:
        # HOME points into the /crawls mount.
        - name: HOME
          value: /crawls/home

        - name: CRAWL_ID
          value: "{{ id }}"

        - name: WEBHOOK_URL
          value: "{{ redis_url }}/crawls-done"

        - name: STORE_PATH
          value: "{{ storage_path }}"

        - name: STORE_FILENAME
          value: "{{ storage_filename }}"

        - name: STORE_USER
          value: "{{ userid }}"

        - name: WARC_PREFIX
          value: "{{ warc_prefix }}"

        # SOCKS proxy settings: the port is only considered when a host is
        # set.
        {% if crawler_socks_proxy_host %}
        - name: SOCKS_HOST
          value: "{{ crawler_socks_proxy_host }}"
        {% if crawler_socks_proxy_port %}
        - name: SOCKS_PORT
          value: "{{ crawler_socks_proxy_port }}"
        {% endif %}
        {% endif %}

      resources:
        # Only memory is limited; no CPU limit is set here.
        limits:
          memory: "{{ memory_limit }}"

        requests:
          cpu: "{{ cpu }}"
          memory: "{{ memory }}"

      {% if crawler_liveness_port and crawler_liveness_port != '0' %}
      # Probe is skipped when the liveness port is unset or the string '0'.
      livenessProbe:
        httpGet:
          path: /healthz
          # Deliberately unquoted so it renders as a number; a quoted value
          # would be read as a named port - NOTE(review): confirm.
          port: {{ crawler_liveness_port }}

        initialDelaySeconds: 15
        periodSeconds: 120
        failureThreshold: 3
      {% endif %}

{% endif %}