browsertrix/backend/btrixcloud/templates/crawler.yaml
Ilya Kreymer 60ba9e366f
Refactor to use new operator on backend (#789)
* Btrixjobs Operator - Phase 1 (#679)

- add metacontroller and custom crds
- add main_op entrypoint for operator

* Btrix Operator Crawl Management (#767)

* operator backend:
- run operator api in separate container but in same pod, with WEB_CONCURRENCY=1
- operator creates statefulsets and services for CrawlJob and ProfileJob
- operator: use service hook endpoint, set port in values.yaml

* crawls working with CrawlJob
- jobs start with 'crawljob-' prefix
- update status to reflect current crawl state
- set sync time to 10 seconds by default, overridable with 'operator_resync_seconds'
- mark crawl as running, failed, complete when finished
- store finished status when crawl is complete
- support updating scale, forcing rollover, stop via patching CrawlJob
- support cancel via deletion
- requires a Content-Length workaround when patching custom resources
- auto-delete of CrawlJob via 'ttlSecondsAfterFinished'
- also delete PVCs manually until auto-delete via statefulset is supported (k8s >1.27)
- ensure filesAdded always set correctly, keep counter in redis, add to status display
- optimization: attempt to reduce automerging by reusing volumeClaimTemplates from existing children, as these may have additional props added
- add add_crawl_errors_to_db() for storing crawl errors from redis '<crawl>:e' key to mongodb when crawl is finished/failed/canceled
- add .status.size to display human-readable crawl size, if available (from webrecorder/browsertrix-crawler#291)
- support new page size, >0.9.0 and old page size key (changed in webrecorder/browsertrix-crawler#284)
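
A rough sketch of the CrawlJob object the operator reconciles, based only on the behaviors listed above. The apiVersion and field names are illustrative placeholders; the authoritative schema lives in the CRDs added in this PR:

```yaml
# Illustrative CrawlJob sketch -- apiVersion and field names are
# assumptions, not the actual CRD schema
apiVersion: btrix.cloud/v1
kind: CrawlJob
metadata:
  name: crawljob-<id>            # jobs start with the 'crawljob-' prefix
spec:
  scale: 2                       # patch to change the number of crawler pods
  stopping: false                # patch to request a graceful stop
  ttlSecondsAfterFinished: 30    # finished CrawlJobs are auto-deleted after this TTL
```

Deleting the object outright cancels the crawl, per the bullets above.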

* support for scheduled jobs!
- add main_scheduled_job entrypoint to run scheduled jobs
- add crawl_cron_job.yaml template for declaring CronJob
- CronJobs moved to default namespace
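
The crawl_cron_job.yaml template might declare something along these lines; the container name, image variable, and most fields here are placeholders, with only the `main_scheduled_job` entrypoint and the default namespace taken from this PR:

```yaml
# Hedged sketch of a scheduled-crawl CronJob; not the actual template
apiVersion: batch/v1
kind: CronJob
metadata:
  name: sched-{{ cid }}
  namespace: default              # CronJobs now live in the default namespace
spec:
  schedule: "{{ schedule }}"
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      template:
        spec:
          restartPolicy: Never
          containers:
            - name: scheduled-job
              image: {{ backend_image }}
              command: ["main_scheduled_job"]   # entrypoint added in this PR
```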

* operator manages ProfileJobs:
- jobs start with 'profilejob-'
- update expiry time by updating ProfileJob object 'expireTime' while profile is active
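
For illustration, a ProfileJob fragment showing the expiry mechanism; only the `expireTime` field and the name prefix come from this PR, the rest is hypothetical:

```yaml
# Illustrative ProfileJob fragment
apiVersion: btrix.cloud/v1
kind: ProfileJob
metadata:
  name: profilejob-<id>
spec:
  expireTime: "2023-04-24T19:00:00Z"   # pushed forward while the profile is active
```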

* refactor/cleanup:
- remove k8s package
- merge k8sman and basecrawlmanager into crawlmanager
- move templates, k8sapi, utils into root package
- delete all *_job.py files
- remove dt_now, ts_now from crawls, now in utils
- all db operations happen in crawl/crawlconfig/org files
- move shared crawl/crawlconfig/org functions that use the db to be importable directly,
including get_crawl_config, add_new_crawl, inc_crawl_stats

* role binding: more secure setup, don't allow crawler namespace any k8s permissions
- move cronjobs to be created in default namespace
- grant default namespace access to create cronjobs in default namespace
- remove role binding from crawler namespace
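
An illustrative Role/RoleBinding pair for the CronJob grant described above; the actual chart templates and names may differ:

```yaml
# Sketch: allow the default namespace's service account to manage CronJobs
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: cronjob-manager
  namespace: default
rules:
  - apiGroups: ["batch"]
    resources: ["cronjobs"]
    verbs: ["get", "list", "create", "update", "delete"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: cronjob-manager-binding
  namespace: default
subjects:
  - kind: ServiceAccount
    name: default
    namespace: default
roleRef:
  kind: Role
  name: cronjob-manager
  apiGroup: rbac.authorization.k8s.io
```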

* additional tweaks to templates:
- templates: split crawler and redis statefulset into separate yaml file (in case need to load one or other separately)

* stats / redis optimization:
- don't update stats in mongodb on every operator sync, only when crawl is finished
- for api access, read stats directly from redis to get up-to-date stats
- move get_page_stats() to utils, add get_redis_url() to k8sapi to unify access
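
A minimal sketch of the unified redis access described above. The env var names, default host, and the errors-list shape are assumptions for illustration; only the `'<crawl>:e'` errors key is taken from this PR's description:

```python
# Sketch of a shared get_redis_url() helper and redis-backed error lookup.
# Env var names and defaults are illustrative, not the actual code.
import os


def get_redis_url() -> str:
    """Single place to compose the redis:// URL, shared by operator and API."""
    host = os.environ.get("REDIS_HOST", "local-redis")
    port = os.environ.get("REDIS_PORT", "6379")
    return f"redis://{host}:{port}/0"


def get_crawl_errors(redis_data: dict, crawl_id: str) -> list:
    """Errors accumulate under the '<crawl>:e' key and are copied to mongodb
    only when the crawl finishes, fails, or is canceled."""
    return redis_data.get(f"{crawl_id}:e", [])
```

Reading counters and errors straight from redis keeps the API responses current without writing to mongodb on every operator sync.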

* Add migration for operator changes
- update configmap for crawl configs with scale > 1 or crawlTimeout > 0, and recreate CronJobs for configs that have a schedule
- add option to rerun last migration, enabled via env var and by running helm with --set=rerun_last_migration=1

* subcharts: move crawljob and profilejob CRDs to a separate subchart, as this seems the best way to guarantee proper install order and updates on upgrade with helm; add built btrix-crds-0.1.0.tgz subchart
- metacontroller: use release from ghcr, add metacontroller-helm-v4.10.1.tgz subchart

* backend api fixes
- ensure changing scale of crawl also updates it in the db
- crawlconfigs: add 'currCrawlSize' and 'lastCrawlSize' to crawlconfig api

---------

Co-authored-by: D. Lee <leepro@gmail.com>
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-04-24 18:30:52 -07:00


# -------
# CRAWLER
# -------
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: crawl-{{ id }}
  namespace: {{ namespace }}
  labels:
    crawl: {{ id }}
    role: crawler

spec:
  selector:
    matchLabels:
      crawl: {{ id }}
      role: crawler

  serviceName: crawl-{{ id }}
  replicas: {{ scale }}
  podManagementPolicy: Parallel

  # not yet supported
  #persistentVolumeClaimRetentionPolicy:
  #  whenDeleted: Delete
  #  whenScaled: Delete

  volumeClaimTemplates:
    - metadata:
        name: crawl-data
        labels:
          crawl: {{ id }}
          role: crawler

      spec:
        accessModes:
          - ReadWriteOnce

        resources:
          requests:
            storage: {{ requests_hd }}

        {% if volume_storage_class %}
        storageClassName: {{ volume_storage_class }}
        {% endif %}

  template:
    metadata:
      labels:
        crawl: {{ id }}
        role: crawler
      {% if force_restart %}
      annotations:
        btrix.crawlForceRestart: "{{ force_restart }}"
      {% endif %}

    spec:
      terminationGracePeriodSeconds: {{ termination_grace_secs }}
      #nodeSelector: {{ crawl_node_selector }}
      volumes:
        - name: crawl-config
          configMap:
            name: crawl-config-{{ cid }}

      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 1
              preference:
                matchExpressions:
                  - key: nodeType
                    operator: In
                    values:
                      - "{{ crawler_node_type }}"

        podAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 2
              podAffinityTerm:
                topologyKey: "failure-domain.beta.kubernetes.io/zone"
                labelSelector:
                  matchLabels:
                    job-name: job-{{ id }}
                    crawl: {{ id }}

      tolerations:
        - key: "nodeType"
          operator: "Equal"
          value: "crawling"
          effect: "NoSchedule"

      containers:
        - name: crawler
          image: {{ crawler_image }}
          imagePullPolicy: {{ crawler_image_pull_policy }}
          command:
            - crawl
            - --config
            - /tmp/crawl-config.json
            - --redisStoreUrl
            - {{ redis_url }}
          {%- if profile_filename %}
            - --profile
            - "@profiles/{{ profile_filename }}"
          {%- endif %}

          volumeMounts:
            - name: crawl-config
              mountPath: /tmp/crawl-config.json
              subPath: crawl-config.json
              readOnly: True

            - name: crawl-data
              mountPath: /crawls

          envFrom:
            - configMapRef:
                name: shared-crawler-config
            - secretRef:
                name: storage-{{ storage_name }}

          env:
            - name: CRAWL_ID
              value: {{ id }}
            - name: WEBHOOK_URL
              value: {{ redis_url }}/crawls-done
            - name: STORE_PATH
              value: {{ store_path }}
            - name: STORE_FILENAME
              value: {{ store_filename }}
            - name: STORE_USER
              value: {{ userid }}

          resources:
            limits:
              cpu: {{ crawler_limits_cpu }}
              memory: {{ crawler_limits_memory }}
            requests:
              cpu: {{ crawler_requests_cpu }}
              memory: {{ crawler_requests_memory }}

          {% if crawler_liveness_port and crawler_liveness_port != '0' %}
          livenessProbe:
            httpGet:
              path: /healthz
              port: {{ crawler_liveness_port }}
            initialDelaySeconds: 15
            periodSeconds: 120
            failureThreshold: 3
          {% endif %}

---
apiVersion: v1
kind: Service
metadata:
  name: crawl-{{ id }}
  labels:
    crawl: {{ id }}
    role: crawler

spec:
  clusterIP: None

  selector:
    crawl: {{ id }}
    role: crawler

  ports:
    - protocol: TCP
      port: 9037
      name: screencast