- use python-on-whales to drive the Docker CLI directly, creating a docker stack for each crawl or profile browser (a minimal deploy sketch follows this list)
- configure storages via a storages.yaml secret
- add crawl_job, profile_job, split into base and k8s/swarm implementations
- split manager into a base crawlmanager and k8s/swarm implementations
- swarm: load initial scale from the db to avoid modifying fixed configs; in k8s, load it from a configmap
- swarm: support scheduled jobs via the swarm-cronjob service
- remove docker dependencies (aiodocker, apscheduler, scheduling)
- swarm: when using local minio, expose it via a /data/ route in nginx via an extra include (in k8s, the include dir is empty and routing is handled via the ingress)
- k8s: clean up the minio chart: move init containers to minio.yaml
- swarm: stateful-set-style implementation to stay consistent with k8s scaling:
  - don't use service replicas
  - create a unique service with '-N' appended and allocate a unique volume for each replica
  - allows crawl containers to be restarted w/o losing data
  - add a volume-pruning background service, as volumes can only be deleted after the service shuts down fully (see the pruning sketch after the template below)
- watch: fully simplify routing, route via replica index instead of IP for both k8s and swarm
- rename network btrix-cloud-net -> btrix-net to avoid conflict with the compose network
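
For illustration, here is a minimal sketch of how a per-crawl stack could be deployed with python-on-whales by rendering the template below and passing it to `stack deploy`. The `run_crawl_stack` helper, the `templates/crawl_job.yaml` path, the stack naming, and the parameter names are assumptions for the sketch, not the actual backend API.

```python
# Sketch only: deploy one docker stack per crawl with python-on-whales.
# The helper name, template path, and parameters are illustrative assumptions.
import tempfile

from jinja2 import Environment, FileSystemLoader
from python_on_whales import DockerClient

docker = DockerClient()
templates = Environment(loader=FileSystemLoader("templates"))


def run_crawl_stack(crawl_id: str, index: int, params: dict) -> None:
    """Render the crawl job template and deploy it as a docker stack."""
    rendered = templates.get_template("crawl_job.yaml").render(
        id=crawl_id, index=index, **params
    )

    # python-on-whales reads compose/stack files from disk, so write the
    # rendered template to a temporary file first
    with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh:
        fh.write(rendered)

    # equivalent to: docker stack deploy -c <rendered file> crawl-<id>-<index>
    docker.stack.deploy(f"crawl-{crawl_id}-{index}", compose_files=[fh.name])
```

The stack template that such a call would render is shown below: it defines the crawler service, a shared redis started only with the first replica, the external btrix-net network, and a uniquely named volume per replica.
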
version: '3.9'

services:
  crawler:
    image: {{ crawler_image }}
    command:
      - crawl
      - --config
      - /crawlconfig.json
      - --redisStoreUrl
      - {{ redis_url }}
{%- if profile_filename %}
      - --profile
      - "@profiles/{{ profile_filename }}"
{%- endif %}

    hostname: "crawl-{{ id }}-{{ index }}"

    networks:
      - btrix

    configs:
      - crawlconfig.json

    volumes:
      - crawl-data:/crawls

    stop_grace_period: 1000s

    deploy:
      endpoint_mode: dnsrr
      replicas: 1
      labels:
        crawl: {{ id }}
        role: crawler

      resources:
        limits:
          cpus: "{{ crawler_limits_cpu }}"
          memory: "{{ crawler_limits_memory }}"
        reservations:
          cpus: "{{ crawler_requests_cpu }}"
          memory: "{{ crawler_requests_memory }}"

    environment:
      - CRAWL_ID={{ id }}

      - STORE_ENDPOINT_URL={{ endpoint_url }}
      - STORE_ACCESS_KEY={{ access_key }}
      - STORE_SECRET_KEY={{ secret_key }}

      - STORE_PATH={{ storage_path }}
      - STORE_FILENAME={{ storage_filename }}
      - STORE_USER={{ userid }}

{%- if auth_token %}
      - WACZ_SIGN_TOKEN={{ auth_token }}
      - WACZ_SIGN_URL=http://authsign:5053/sign
{%- endif %}

      - WEBHOOK_URL={{ redis_url }}/crawls-done
      - CRAWL_ARGS={{ crawler_args }}

{% if index == 0 %}
  # a single shared redis is created only with the first (index == 0) crawler stack
  redis:
    image: {{ redis_image }}
    command: ["redis-server", "--appendonly", "yes"]

    deploy:
      endpoint_mode: dnsrr
      replicas: 1
      labels:
        crawl: {{ id }}
        role: redis

    networks:
      - btrix

{% endif %}

networks:
  btrix:
    external:
      name: btrix-net

configs:
  crawlconfig.json:
    external: true
    name: crawl-config-{{ cid }}

volumes:
  # one uniquely named volume per crawl replica, so crawl containers can be
  # restarted without losing data
  crawl-data:
    name: "crawl-{{ id }}-{{ index }}"
    labels:
      btrix.crawl: {{ id }}
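
Since a replica's volume can only be removed once its service has fully shut down, pruning has to be retried in the background. Below is a rough sketch of such a loop using python-on-whales; the `btrix.crawl` label matches the template above, but the interval, the exception handling, and the decision of which crawls are actually finished (omitted here) are assumptions about the real service.

```python
# Rough sketch of the volume-pruning background service, using python-on-whales.
# Assumes crawl volumes carry the btrix.crawl label; the check for whether a
# crawl has actually finished is omitted.
import asyncio

from python_on_whales import DockerClient
from python_on_whales.exceptions import DockerException

docker = DockerClient()


async def prune_crawl_volumes(interval: int = 60) -> None:
    """Periodically try to remove crawl volumes; a volume still attached to a
    running crawler service can't be removed yet, so it is retried next round."""
    while True:
        for volume in docker.volume.list(filters={"label": "btrix.crawl"}):
            try:
                docker.volume.remove(volume)
            except DockerException:
                # service not fully shut down yet -- try again on the next pass
                pass
        await asyncio.sleep(interval)
```
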