browsertrix/chart/app-templates/crawler.yaml
Ilya Kreymer 8ea16393c5
Optimize single-page crawl workflows (#2656)
For single page crawls:
- Always force 1 browser to be used, ignoring browser windows/scale
setting
- Don't use custom PVC volumes in crawler / redis, just use emptyDir,
since there is no chance of the crawler being interrupted and restarted
on a different machine for a single page.

Adds an 'is_single_page' check to CrawlConfig, checking for either limit
or scopeType / no extra hops.

Fixes #2655
2025-06-10 12:13:57 -07:00

254 lines
5.7 KiB
YAML

{% if not no_pvc %}
# -------
# PVC
# -------
# Persistent volume claim that backs the crawler pod's `crawl-data` volume.
# Rendered only when `no_pvc` is falsy; when `no_pvc` is set the pod uses an
# emptyDir instead and no claim is created.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  # Templated string values are quoted so that an empty or number-like
  # expansion is still parsed as a string.
  name: "{{ name }}"
  namespace: "{{ namespace }}"
  labels:
    crawl: "{{ id }}"
    role: crawler

spec:
  accessModes:
    - ReadWriteOnce

  resources:
    requests:
      # Left unquoted: this is a resource quantity, not a plain string.
      storage: {{ storage }}

  # Storage class is only set when one is supplied to the template.
  {% if volume_storage_class %}
  storageClassName: "{{ volume_storage_class }}"
  {% endif %}
{% endif %}
# -------
# CRAWLER
# -------
# Crawler pod. Rendered only when `init_crawler` is truthy. It is emitted as
# its own YAML document so it can follow the optional PVC document above.
{% if init_crawler %}
---
apiVersion: v1
kind: Pod
metadata:
  name: "{{ name }}"
  namespace: "{{ namespace }}"
  labels:
    crawl: "{{ id }}"
    role: crawler
    network-policy: limit-crawler-egress

spec:
  hostname: "{{ name }}"
  subdomain: crawler

  {% if priorityClassName %}
  priorityClassName: "{{ priorityClassName }}"
  {% endif %}

  restartPolicy: OnFailure

  # Pod-level security context: numeric ids must stay unquoted integers.
  securityContext:
    runAsNonRoot: true
    runAsUser: {{ crawler_uid }}
    runAsGroup: {{ crawler_gid }}
    fsGroup: {{ crawler_fsgroup }}

  terminationGracePeriodSeconds: {{ termination_grace_secs }}

  volumes:
    - name: crawl-config
      configMap:
        name: "crawl-config-{{ id }}"
    # QA runs additionally mount the replay config of the source crawl.
    {% if qa_source_crawl_id %}
    - name: qa-config
      configMap:
        name: "qa-replay-{{ qa_source_crawl_id }}"
    {% endif %}
    # Crawl data lives on the PVC of the same name, or on an emptyDir when
    # `no_pvc` is set.
    - name: crawl-data
      {% if not no_pvc %}
      persistentVolumeClaim:
        claimName: "{{ name }}"
      {% else %}
      emptyDir: {}
      {% endif %}
    {% if proxy_id %}
    # NOTE(review): `0600` relies on the loader reading a leading-zero integer
    # as octal (YAML 1.1 behaviour, decimal 384); a strict YAML 1.2 loader
    # would read decimal 600. Confirm against the loader in use.
    - name: proxies
      secret:
        secretName: proxies
        defaultMode: 0600
    - name: force-user-and-group-name
      secret:
        secretName: force-user-and-group-name
        defaultMode: 0600
    {% endif %}

  affinity:
    # Hard node requirement, only when a crawler node type is configured.
    {% if crawler_node_type %}
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: nodeType
                operator: In
                values:
                  - "{{ crawler_node_type }}"
    {% endif %}
    # Soft preference to share a host with other pods of the same crawl.
    podAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 10
          podAffinityTerm:
            topologyKey: "kubernetes.io/hostname"
            labelSelector:
              matchExpressions:
                - key: crawl
                  operator: In
                  values:
                    - "{{ id }}"

  tolerations:
    - key: nodeType
      operator: Equal
      value: crawling
      effect: NoSchedule
    - key: node.kubernetes.io/not-ready
      operator: Exists
      tolerationSeconds: 300
      effect: NoExecute
    - key: node.kubernetes.io/unreachable
      operator: Exists
      effect: NoExecute
      tolerationSeconds: 300

  containers:
    - name: crawler
      image: "{{ crawler_image }}"
      imagePullPolicy: "{{ crawler_image_pull_policy }}"
      command:
        # Entry point is `qa` for QA runs and `crawl` otherwise.
        - {{ "crawl" if not qa_source_crawl_id else "qa" }}
        - --config
        - /tmp/config/crawl-config.json
        - --workers
        - "{{ workers }}"
        - --redisStoreUrl
        - "{{ redis_url }}"
        # A QA source and a browser profile are mutually exclusive here:
        # the profile flag is only passed when this is not a QA run.
        {% if qa_source_crawl_id %}
        - --qaSource
        - /tmp/qa/qa-config.json
        {% elif profile_filename %}
        - --profile
        - "@{{ profile_filename }}"
        {% endif %}
        {% if proxy_id %}
        - --proxyServer
        - "{{ proxy_url }}"
        {% if proxy_ssh_private_key %}
        - --sshProxyPrivateKeyFile
        - /tmp/ssh-proxy/private-key
        {% endif %}
        {% if proxy_ssh_host_public_key %}
        - --sshProxyKnownHostsFile
        - /tmp/ssh-proxy/known-hosts
        {% endif %}
        {% endif %}

      volumeMounts:
        - name: crawl-config
          mountPath: /tmp/config/
          readOnly: true
        {% if qa_source_crawl_id %}
        - name: qa-config
          mountPath: /tmp/qa/
          readOnly: true
        {% endif %}
        {% if proxy_id %}
        # Per-proxy SSH material is selected from the shared `proxies` secret
        # by sub-path, keyed on the proxy id.
        {% if proxy_ssh_private_key %}
        - name: proxies
          mountPath: /tmp/ssh-proxy/private-key
          subPath: "{{ proxy_id }}-private-key"
          readOnly: true
        {% endif %}
        {% if proxy_ssh_host_public_key %}
        - name: proxies
          mountPath: /tmp/ssh-proxy/known-hosts
          subPath: "{{ proxy_id }}-known-hosts"
          readOnly: true
        {% endif %}
        # When a proxy is configured, /etc/passwd and /etc/group are replaced
        # by the entries from the `force-user-and-group-name` secret.
        - name: force-user-and-group-name
          mountPath: /etc/passwd
          subPath: passwd
          readOnly: true
        - name: force-user-and-group-name
          mountPath: /etc/group
          subPath: group
          readOnly: true
        {% endif %}
        - name: crawl-data
          mountPath: /crawls
        # NOTE(review): presumably /tmp is served from the data volume because
        # the root filesystem is read-only (see securityContext below).
        - name: crawl-data
          mountPath: /tmp
          subPath: tmp

      envFrom:
        - configMapRef:
            name: shared-crawler-config
        - secretRef:
            name: "{{ storage_secret }}"
        {% if signing_secret %}
        - secretRef:
            name: "{{ signing_secret }}"
        {% endif %}

      env:
        - name: HOME
          value: /crawls/home
        - name: CRAWL_ID
          value: "{{ id }}"
        - name: WEBHOOK_URL
          value: "{{ redis_url }}/crawls-done"
        - name: STORE_PATH
          value: "{{ storage_path }}"
        - name: STORE_FILENAME
          value: "{{ storage_filename }}"
        - name: STORE_USER
          value: "{{ userid }}"
        - name: WARC_PREFIX
          value: "{{ warc_prefix }}"

      resources:
        limits:
          memory: "{{ memory_limit }}"
        requests:
          cpu: "{{ cpu }}"
          memory: "{{ memory }}"

      # The liveness probe is omitted when no port is given or the port is
      # the string '0'.
      {% if crawler_liveness_port and crawler_liveness_port != '0' %}
      livenessProbe:
        httpGet:
          path: /healthz
          port: {{ crawler_liveness_port }}
        initialDelaySeconds: 15
        periodSeconds: 120
        failureThreshold: 3
      {% endif %}

      # Container-level hardening.
      securityContext:
        allowPrivilegeEscalation: false
        readOnlyRootFilesystem: true
{% endif %}