Resolves #1354 Supports crawling through pre-configured proxy servers, allowing users to select which proxy servers to use (requires browsertrix crawler 1.3+) Config: - proxies defined in btrix-proxies subchart - can be configured via btrix-proxies key or separate proxies.yaml file via separate subchart - proxies list refreshed automatically if crawler_proxies.json changes if subchart is deployed - support for ssh and socks5 proxies - proxy keys added to secrets in subchart - support for default proxy to be always used if no other proxy configured, prevent starting cluster if default proxy not available - prevent starting manual crawl if previously configured proxy is no longer available, return error - force 'btrix' username and group name on browsertrix-crawler non-root user to support ssh Operator: - support crawling through proxies, pass proxyId in CrawlJob - support running profile browsers with a designated proxy, pass proxyId to ProfileJob - prevent starting scheduled crawl if previously configured proxy is no longer available API / Access: - /api/orgs/all/crawlconfigs/crawler-proxies - get all proxies (superadmin only) - /api/orgs/{oid}/crawlconfigs/crawler-proxies - get proxies available to particular org - /api/orgs/{oid}/proxies - update allowed proxies for particular org (superadmin only) - superadmin can configure which orgs can use which proxies, stored on the org - superadmin can also allow an org to access all 'shared' proxies, to avoid having to allow a shared proxy on each org. UI: - Superadmin has 'Edit Proxies' dialog to configure for each org if it has: dedicated proxies, has access to shared proxies. 
- User can select a proxy in Crawl Workflow browser settings - Users can choose to launch a browser profile with a particular proxy - Display which proxy is used to create profile in profile selector - Users can choose which default proxy to use for new workflows in Crawling Defaults --------- Co-authored-by: Ilya Kreymer <ikreymer@gmail.com> Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
		
			
				
	
	
		
			243 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
			
		
		
	
	
			243 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
# -------
# PVC
# -------
# Per-crawl PersistentVolumeClaim that backs the crawler pod's /crawls data
# volume. Rendered via Jinja2 by the operator; `name`, `namespace`, `id`,
# `storage`, and `volume_storage_class` are substituted at render time.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  # Templated scalars are quoted so an all-digit id or empty expansion
  # cannot be mis-typed as int/null by the YAML parser.
  name: "{{ name }}"
  namespace: "{{ namespace }}"
  labels:
    crawl: "{{ id }}"
    role: crawler

spec:
  accessModes:
    - ReadWriteOnce

  resources:
    requests:
      # e.g. "500Mi" / "10Gi" — quoted to keep it a string
      storage: "{{ storage }}"

  # Only pin a storage class when one is configured; otherwise fall back
  # to the cluster default.
  {% if volume_storage_class %}
  storageClassName: "{{ volume_storage_class }}"
  {% endif %}
# -------
# CRAWLER
# -------
# Crawler pod spec, one per crawler instance. Skipped entirely on restart
# (the existing pod is reused); the PVC above is always (re)applied.
{% if not do_restart %}
---
apiVersion: v1
kind: Pod
metadata:
  name: "{{ name }}"
  namespace: "{{ namespace }}"
  labels:
    crawl: "{{ id }}"
    role: crawler
    # matched by the NetworkPolicy restricting crawler egress
    network-policy: limit-crawler-egress

spec:
  # stable DNS name <name>.crawler.<namespace>.svc for redis/webhook routing
  hostname: "{{ name }}"
  subdomain: crawler

  {% if priorityClassName %}
  priorityClassName: "{{ priorityClassName }}"
  {% endif %}

  restartPolicy: OnFailure

  # Run browsertrix-crawler as a locked-down non-root user.
  securityContext:
    runAsNonRoot: true
    runAsUser: {{ crawler_uid }}
    runAsGroup: {{ crawler_gid }}
    fsGroup: {{ crawler_fsgroup }}
    allowPrivilegeEscalation: false
    readOnlyRootFilesystem: true

  terminationGracePeriodSeconds: {{ termination_grace_secs }}
  volumes:
    # JSON crawl configuration generated per crawl
    - name: crawl-config
      configMap:
        name: crawl-config-{{ id }}
    {% if qa_source_crawl_id %}
    # replay config for QA runs comparing against a source crawl
    - name: qa-config
      configMap:
        name: qa-replay-{{ qa_source_crawl_id }}
    {% endif %}
    - name: crawl-data
      persistentVolumeClaim:
        claimName: "{{ name }}"
    {% if proxy_id %}
    # ssh/socks5 proxy credentials; 0600 is parsed as octal (owner-only)
    # by the YAML 1.1 parsers Kubernetes tooling uses
    - name: proxies
      secret:
        secretName: proxies
        defaultMode: 0600
    # /etc/passwd + /etc/group overrides forcing the 'btrix' user/group
    # name, required for the ssh client used by ssh proxying
    - name: force-user-and-group-name
      secret:
        secretName: force-user-and-group-name
        defaultMode: 0600
    {% endif %}

  affinity:
{% if crawler_node_type %}
    # hard-pin crawler pods to dedicated crawling nodes when configured
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
            - key: nodeType
              operator: In
              values:
                - "{{ crawler_node_type }}"
{% endif %}

    # prefer co-locating pods of the same crawl on one node
    podAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 10
          podAffinityTerm:
            topologyKey: "kubernetes.io/hostname"
            labelSelector:
              matchExpressions:
              - key: crawl
                operator: In
                values:
                - "{{ id }}"

  tolerations:
    # allow scheduling onto tainted dedicated crawling nodes
    - key: nodeType
      operator: Equal
      value: crawling
      effect: NoSchedule
    # tolerate node failure for 5 minutes before eviction
    - key: node.kubernetes.io/not-ready
      operator: Exists
      tolerationSeconds: 300
      effect: NoExecute
    - key: node.kubernetes.io/unreachable
      operator: Exists
      effect: NoExecute
      tolerationSeconds: 300

  containers:
    - name: crawler
      image: "{{ crawler_image }}"
      imagePullPolicy: {{ crawler_image_pull_policy }}
      command:
        # 'qa' subcommand replays a source crawl; 'crawl' is a normal run
        - {{ "crawl" if not qa_source_crawl_id else "qa" }}
        - --config
        - /tmp/crawl-config.json
        - --workers
        - "{{ workers }}"
        - --redisStoreUrl
        - "{{ redis_url }}"
      {% if qa_source_crawl_id %}
        - --qaSource
        - /tmp/qa/qa-config.json
      {% elif profile_filename %}
        # '@' prefix tells the crawler to fetch the profile from storage
        - --profile
        - "@{{ profile_filename }}"
      {% endif %}
      {% if proxy_id %}
        - --proxyServer
        - "{{ proxy_url }}"
      {% if proxy_ssh_private_key %}
        - --sshProxyPrivateKeyFile
        - /tmp/ssh-proxy/private-key
      {% endif %}
      {% if proxy_ssh_host_public_key %}
        - --sshProxyKnownHostsFile
        - /tmp/ssh-proxy/known-hosts
      {% endif %}
      {% endif %}
      volumeMounts:
        - name: crawl-config
          mountPath: /tmp/crawl-config.json
          subPath: crawl-config.json
          readOnly: true

      {% if qa_source_crawl_id %}
        - name: qa-config
          mountPath: /tmp/qa/
          readOnly: true
      {% endif %}
      {% if proxy_id %}
      {% if proxy_ssh_private_key %}
        # per-proxy keys are stored in the shared 'proxies' secret under
        # '<proxy_id>-...' subPaths
        - name: proxies
          mountPath: /tmp/ssh-proxy/private-key
          subPath: {{ proxy_id }}-private-key
          readOnly: true
      {% endif %}
      {% if proxy_ssh_host_public_key %}
        - name: proxies
          mountPath: /tmp/ssh-proxy/known-hosts
          subPath: {{ proxy_id }}-known-hosts
          readOnly: true
      {% endif %}
        # override /etc/passwd and /etc/group so the crawler uid resolves
        # to the 'btrix' user — ssh refuses to run for an unnamed uid
        - name: force-user-and-group-name
          mountPath: /etc/passwd
          subPath: passwd
          readOnly: true
        - name: force-user-and-group-name
          mountPath: /etc/group
          subPath: group
          readOnly: true
      {% endif %}
        - name: crawl-data
          mountPath: /crawls
      envFrom:
        - configMapRef:
            name: shared-crawler-config

        - secretRef:
            name: "{{ storage_secret }}"

      {% if signing_secret %}
        - secretRef:
            name: "{{ signing_secret }}"
      {% endif %}

      env:
        # HOME on the writable data volume (root fs is read-only)
        - name: HOME
          value: /crawls/home

        - name: CRAWL_ID
          value: "{{ id }}"

        # crawler posts completion events back through redis
        - name: WEBHOOK_URL
          value: "{{ redis_url }}/crawls-done"

        - name: STORE_PATH
          value: "{{ storage_path }}"

        - name: STORE_FILENAME
          value: "{{ storage_filename }}"

        - name: STORE_USER
          value: "{{ userid }}"

        - name: WARC_PREFIX
          value: "{{ warc_prefix }}"

      resources:
        limits:
          memory: "{{ memory_limit }}"

        requests:
          cpu: "{{ cpu }}"
          memory: "{{ memory }}"

      # liveness checking is optional; disabled when port is unset or '0'
      {% if crawler_liveness_port and crawler_liveness_port != '0' %}
      livenessProbe:
        httpGet:
          path: /healthz
          port: {{ crawler_liveness_port }}

        initialDelaySeconds: 15
        periodSeconds: 120
        failureThreshold: 3
      {% endif %}

{% endif %}