Resolves #1354 Supports crawling through pre-configured proxy servers, allowing users to select which proxy servers to use (requires browsertrix crawler 1.3+) Config: - proxies defined in btrix-proxies subchart - can be configured via btrix-proxies key or separate proxies.yaml file via separate subchart - proxies list refreshed automatically if crawler_proxies.json changes if subchart is deployed - support for ssh and socks5 proxies - proxy keys added to secrets in subchart - support for default proxy to be always used if no other proxy configured, prevent starting cluster if default proxy not available - prevent starting manual crawl if previously configured proxy is no longer available, return error - force 'btrix' username and group name on browsertrix-crawler non-root user to support ssh Operator: - support crawling through proxies, pass proxyId in CrawlJob - support running profile browsers with a designated proxy, pass proxyId to ProfileJob - prevent starting scheduled crawl if previously configured proxy is no longer available API / Access: - /api/orgs/all/crawlconfigs/crawler-proxies - get all proxies (superadmin only) - /api/orgs/{oid}/crawlconfigs/crawler-proxies - get proxies available to particular org - /api/orgs/{oid}/proxies - update allowed proxies for particular org (superadmin only) - superadmin can configure which orgs can use which proxies, stored on the org - superadmin can also allow an org to access all 'shared' proxies, to avoid having to allow a shared proxy on each org. UI: - Superadmin has 'Edit Proxies' dialog to configure for each org if it has: dedicated proxies, has access to shared proxies. 
- User can select a proxy in Crawl Workflow browser settings - Users can choose to launch a browser profile with a particular proxy - Display which proxy is used to create profile in profile selector - Users can choose which default proxy to use for new workflows in Crawling Defaults --------- Co-authored-by: Ilya Kreymer <ikreymer@gmail.com> Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
		
			
				
	
	
		
			152 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
			
		
		
	
	
			152 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
---
# Jinja2 template for a single interactive "profile browser" Pod.
#
# The rendered Pod is named browser-<id> and runs `create-login-profile
# --interactive` from the crawler image, writing the profile to
# /tmp/profile.tar.gz.
#
# Template variables read below:
#   id, namespace                    - pod name / hostname / label, namespace
#   crawler_uid, crawler_gid,
#   crawler_fsgroup                  - numeric ids for the pod securityContext
#   profile_browser_workdir_size     - sizeLimit of the emptyDir at /tmp/home
#   proxy_id, proxy_url              - optional; add --proxyServer and the
#                                      proxy-related secret mounts
#   proxy_ssh_private_key,
#   proxy_ssh_host_public_key        - optional; add the ssh key / known-hosts
#                                      arguments and mounts
#   priorityClassName                - optional pod priority class
#   crawler_node_type                - optional; adds a required node affinity
#   crawler_image,
#   crawler_image_pull_policy        - container image settings
#   url, profile_filename            - start URL, optional existing profile
#   storage_secret, storage_path     - envFrom secret name and STORE_PATH
#   vnc_password                     - VNC_PASS env value
#   crawler_socks_proxy_host,
#   crawler_socks_proxy_port         - optional; sets CHROME_FLAGS (port
#                                      defaults to 9050)
#   profile_memory, profile_cpu      - resource limits / requests
#
# String-valued substitutions are double-quoted so that an empty, all-digit or
# indicator-containing value still renders as a YAML string. The numeric ids
# in securityContext are deliberately left unquoted because they must stay
# integers.
apiVersion: v1
kind: Pod
metadata:
  name: browser-{{ id }}
  namespace: "{{ namespace }}"
  labels:
    browser: "{{ id }}"
    role: browser
    network-policy: limit-crawler-egress

spec:
  hostname: browser-{{ id }}
  subdomain: browser

  # NOTE(review): allowPrivilegeEscalation and readOnlyRootFilesystem are
  # container-level SecurityContext fields in the Kubernetes API, not
  # PodSecurityContext fields, so at this level they are presumably dropped
  # (or rejected under strict validation) rather than enforced. They are kept
  # here unchanged: moving them onto the container would begin enforcing a
  # read-only root filesystem while the command below writes
  # /tmp/profile.tar.gz. Confirm the intent before relocating them.
  securityContext:
    runAsNonRoot: true
    runAsUser: {{ crawler_uid }}
    runAsGroup: {{ crawler_gid }}
    fsGroup: {{ crawler_fsgroup }}
    allowPrivilegeEscalation: false
    readOnlyRootFilesystem: true

  volumes:
    # Scratch space mounted as the container's HOME.
    - name: crawler-workdir
      emptyDir:
        sizeLimit: "{{ profile_browser_workdir_size }}"

    {% if proxy_id %}
    # defaultMode must be an integer. The leading-zero form relies on YAML 1.1
    # octal parsing (0600 == decimal 384); a strict YAML 1.2 loader would read
    # it as decimal 600 - verify against the loader that consumes this file.
    - name: proxies
      secret:
        secretName: proxies
        defaultMode: 0600
    - name: force-user-and-group-name
      secret:
        secretName: force-user-and-group-name
        defaultMode: 0600
    {% endif %}

  {% if priorityClassName %}
  priorityClassName: "{{ priorityClassName }}"
  {% endif %}

  restartPolicy: OnFailure

  {% if crawler_node_type %}
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
            - key: nodeType
              operator: In
              values:
                - "{{ crawler_node_type }}"
  {% endif %}

  tolerations:
    - key: nodeType
      operator: Equal
      value: crawling
      effect: NoSchedule
    - key: node.kubernetes.io/not-ready
      operator: Exists
      tolerationSeconds: 300
      effect: NoExecute
    - key: node.kubernetes.io/unreachable
      operator: Exists
      effect: NoExecute
      tolerationSeconds: 300

  containers:
    - name: browser
      image: "{{ crawler_image }}"
      imagePullPolicy: "{{ crawler_image_pull_policy }}"
      command:
        - create-login-profile
        - --interactive
        - --filename
        - /tmp/profile.tar.gz
        - --url
        - "{{ url }}"
      {%- if profile_filename %}
        - --profile
        - "@{{ profile_filename }}"
      {%- endif %}
      {% if proxy_id %}
        - --proxyServer
        - "{{ proxy_url }}"
      {% if proxy_ssh_private_key %}
        - --sshProxyPrivateKeyFile
        - /tmp/ssh-proxy/private-key
      {% endif %}
      {% if proxy_ssh_host_public_key %}
        - --sshProxyKnownHostsFile
        - /tmp/ssh-proxy/known-hosts
      {% endif %}
      {% endif %}

      volumeMounts:
        - name: crawler-workdir
          mountPath: /tmp/home
      {% if proxy_id %}
      {% if proxy_ssh_private_key %}
        # Single key of the proxies secret, selected by proxy id.
        - name: proxies
          mountPath: /tmp/ssh-proxy/private-key
          subPath: "{{ proxy_id }}-private-key"
          readOnly: true
      {% endif %}
      {% if proxy_ssh_host_public_key %}
        - name: proxies
          mountPath: /tmp/ssh-proxy/known-hosts
          subPath: "{{ proxy_id }}-known-hosts"
          readOnly: true
      {% endif %}
        # Replaces /etc/passwd and /etc/group with the entries stored in the
        # force-user-and-group-name secret whenever a proxy is in use.
        - name: force-user-and-group-name
          mountPath: /etc/passwd
          subPath: passwd
          readOnly: true
        - name: force-user-and-group-name
          mountPath: /etc/group
          subPath: group
          readOnly: true
      {% endif %}

      envFrom:
        - secretRef:
            name: "{{ storage_secret }}"

      env:
        # Points HOME at the writable crawler-workdir mount.
        - name: HOME
          value: /tmp/home

        - name: STORE_PATH
          value: "{{ storage_path }}"

        - name: VNC_PASS
          value: "{{ vnc_password }}"

      {% if crawler_socks_proxy_host %}
        - name: CHROME_FLAGS
          value: "--proxy-server=socks5://{{ crawler_socks_proxy_host }}:{{ crawler_socks_proxy_port | default('9050') }}"
      {% endif %}

      resources:
        limits:
          memory: "{{ profile_memory }}"

        requests:
          cpu: "{{ profile_cpu }}"
          memory: "{{ profile_memory }}"