Resolves #1354 Supports crawling through pre-configured proxy servers, allowing users to select which proxy servers to use (requires browsertrix crawler 1.3+) Config: - proxies defined in btrix-proxies subchart - can be configured via btrix-proxies key or separate proxies.yaml file via separate subchart - proxies list refreshed automatically if crawler_proxies.json changes if subchart is deployed - support for ssh and socks5 proxies - proxy keys added to secrets in subchart - support for default proxy to be always used if no other proxy configured, prevent starting cluster if default proxy not available - prevent starting manual crawl if previously configured proxy is no longer available, return error - force 'btrix' username and group name on browsertrix-crawler non-root user to support ssh Operator: - support crawling through proxies, pass proxyId in CrawlJob - support running profile browsers with a designated proxy, pass proxyId to ProfileJob - prevent starting scheduled crawl if previously configured proxy is no longer available API / Access: - /api/orgs/all/crawlconfigs/crawler-proxies - get all proxies (superadmin only) - /api/orgs/{oid}/crawlconfigs/crawler-proxies - get proxies available to particular org - /api/orgs/{oid}/proxies - update allowed proxies for particular org (superadmin only) - superadmin can configure which orgs can use which proxies, stored on the org - superadmin can also allow an org to access all 'shared' proxies, to avoid having to allow a shared proxy on each org. UI: - Superadmin has 'Edit Proxies' dialog to configure for each org if it has: dedicated proxies, has access to shared proxies. 
- User can select a proxy in Crawl Workflow browser settings - Users can choose to launch a browser profile with a particular proxy - Display which proxy is used to create profile in profile selector - Users can choose which default proxy to use for new workflows in Crawling Defaults --------- Co-authored-by: Ilya Kreymer <ikreymer@gmail.com> Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
152 lines · 3.6 KiB · YAML
# Jinja2-templated Kubernetes Pod manifest for an interactive profile-browser
# session (browsertrix-crawler `create-login-profile`). Rendered by the
# operator with a ProfileJob's values; `proxy_id` is set only when the profile
# is launched through a pre-configured proxy.
---
apiVersion: v1
kind: Pod
metadata:
  name: browser-{{ id }}
  namespace: {{ namespace }}
  labels:
    browser: {{ id }}
    role: browser
    # Matches the NetworkPolicy restricting crawler egress traffic.
    network-policy: limit-crawler-egress

spec:
  hostname: browser-{{ id }}
  subdomain: browser

  securityContext:
    runAsNonRoot: true
    runAsUser: {{ crawler_uid }}
    runAsGroup: {{ crawler_gid }}
    fsGroup: {{ crawler_fsgroup }}
    allowPrivilegeEscalation: false
    # Root FS is read-only; all writable state lives in the emptyDir below.
    readOnlyRootFilesystem: true

  volumes:
    - name: crawler-workdir
      emptyDir:
        sizeLimit: {{ profile_browser_workdir_size }}

{% if proxy_id %}
    # Proxy credentials (ssh keys / known-hosts) shared across proxies,
    # keyed by proxy id in the mounts below.
    - name: proxies
      secret:
        secretName: proxies
        defaultMode: 0600
    # Overrides /etc/passwd and /etc/group to force the 'btrix' user/group
    # name on the non-root crawler user — required for ssh proxying.
    - name: force-user-and-group-name
      secret:
        secretName: force-user-and-group-name
        defaultMode: 0600
{% endif %}

{% if priorityClassName %}
  priorityClassName: {{ priorityClassName }}
{% endif %}

  restartPolicy: OnFailure

{% if crawler_node_type %}
  # Pin profile browsers to the dedicated crawler node pool when configured.
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: nodeType
                operator: In
                values:
                  - "{{ crawler_node_type }}"
{% endif %}

  tolerations:
    - key: nodeType
      operator: Equal
      value: crawling
      effect: NoSchedule
    # Tolerate node failures for 5 minutes before eviction.
    - key: node.kubernetes.io/not-ready
      operator: Exists
      tolerationSeconds: 300
      effect: NoExecute
    - key: node.kubernetes.io/unreachable
      operator: Exists
      effect: NoExecute
      tolerationSeconds: 300

  containers:
    - name: browser
      image: {{ crawler_image }}
      imagePullPolicy: {{ crawler_image_pull_policy }}
      command:
        - create-login-profile
        - --interactive
        - --filename
        - /tmp/profile.tar.gz
        - --url
        - {{ url }}
{%- if profile_filename %}
        # Start from an existing profile tarball instead of a fresh browser.
        - --profile
        - "@{{ profile_filename }}"
{%- endif %}
{% if proxy_id %}
        - --proxyServer
        - "{{ proxy_url }}"
{% if proxy_ssh_private_key %}
        - --sshProxyPrivateKeyFile
        - /tmp/ssh-proxy/private-key
{% endif %}
{% if proxy_ssh_host_public_key %}
        - --sshProxyKnownHostsFile
        - /tmp/ssh-proxy/known-hosts
{% endif %}
{% endif %}

      volumeMounts:
        - name: crawler-workdir
          mountPath: /tmp/home
{% if proxy_id %}
{% if proxy_ssh_private_key %}
        # Per-proxy ssh material is stored under "<proxy_id>-..." keys.
        - name: proxies
          mountPath: /tmp/ssh-proxy/private-key
          subPath: {{ proxy_id }}-private-key
          readOnly: true
{% endif %}
{% if proxy_ssh_host_public_key %}
        - name: proxies
          mountPath: /tmp/ssh-proxy/known-hosts
          subPath: {{ proxy_id }}-known-hosts
          readOnly: true
{% endif %}
        - name: force-user-and-group-name
          mountPath: /etc/passwd
          subPath: passwd
          readOnly: true
        - name: force-user-and-group-name
          mountPath: /etc/group
          subPath: group
          readOnly: true
{% endif %}

      envFrom:
        - secretRef:
            name: {{ storage_secret }}

      env:
        # HOME is redirected into the writable emptyDir (root FS is read-only).
        - name: HOME
          value: /tmp/home

        - name: STORE_PATH
          value: {{ storage_path }}

        # Password for the VNC session used by the interactive profile UI.
        - name: VNC_PASS
          value: {{ vnc_password }}

{% if crawler_socks_proxy_host %}
        # Legacy cluster-wide socks5 proxy setting, applied via Chrome flags.
        - name: CHROME_FLAGS
          value: "--proxy-server=socks5://{{ crawler_socks_proxy_host }}:{{ crawler_socks_proxy_port | default('9050') }}"
{% endif %}

      resources:
        limits:
          memory: "{{ profile_memory }}"

        requests:
          cpu: "{{ profile_cpu }}"
          memory: "{{ profile_memory }}"