Misc backend fixes for cloud deployment (#26)

* misc backend fixes:
- fix running without a local MinIO instance
- ensure the crawler image pull policy is configurable, loaded via a chart value
- use the DigitalOcean registry for the main backend image (for now)
- add bucket_name to the config only when using the default bucket

* enable all crawler behaviors by default; support 'access_endpoint_url' for default storages

* debugging: add a 'no_delete_jobs' setting for Kubernetes and Docker to disable deletion of completed crawl jobs
This commit is contained in:
Ilya Kreymer 2021-11-25 11:58:26 -08:00 committed by GitHub
parent 58eba70c68
commit 3d4d7049a2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 52 additions and 30 deletions

View File

@ -66,7 +66,7 @@ class RawCrawlConfig(BaseModel):
combineWARC: Optional[bool] = False
logging: Optional[str] = ""
behaviors: Optional[str] = "autoscroll"
behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
# ============================================================================

View File

@ -44,6 +44,8 @@ class DockerManager:
self.extra_crawl_params = extra_crawl_params or []
self._event_q = None
self.no_delete_on_fail = os.environ.get("NO_DELETE_ON_FAIL", "")
self.storages = {
"default": S3Storage(
name="default",
@ -111,7 +113,8 @@ class DockerManager:
for container in results:
print(f"Cleaning Up Orphan Container {container['Id']}", flush=True)
await container.delete()
if not self.no_delete_on_fail:
await container.delete()
results = await self.client.containers.list(
filters=json.dumps(
@ -482,8 +485,10 @@ class DockerManager:
if actor["Attributes"]["exitCode"] != 0:
crawl = self._make_crawl_for_container(container, "failed", True)
await self.crawl_ops.store_crawl(crawl)
await container.delete()
if not self.no_delete_on_fail:
await container.delete()
else:
await container.delete()
# pylint: disable=no-self-use,too-many-arguments
def _make_crawl_for_container(self, container, state, finish_now=False):

View File

@ -37,11 +37,13 @@ class K8SManager:
self.namespace = namespace
self._default_storage_endpoints = {}
self.crawler_image = os.environ.get("CRAWLER_IMAGE")
self.crawler_image_pull_policy = "IfNotPresent"
self.crawler_image = os.environ["CRAWLER_IMAGE"]
self.crawler_image_pull_policy = os.environ["CRAWLER_PULL_POLICY"]
self.crawl_retries = int(os.environ.get("CRAWL_RETRIES", "3"))
self.no_delete_jobs = os.environ.get("NO_DELETE_JOBS", "0") != "0"
self.loop = asyncio.get_running_loop()
self.loop.create_task(self.run_event_loop())
@ -319,7 +321,7 @@ class K8SManager:
return None, None
manual = job.metadata.annotations.get("btrix.run.manual") == "1"
if manual:
if manual and not self.no_delete_jobs:
self.loop.create_task(self._delete_job(job.metadata.name))
crawl = self._make_crawl_for_job(
@ -457,7 +459,7 @@ class K8SManager:
failure = await self.crawl_ops.store_crawl(crawl)
# keep failed jobs around, for now
if not failure:
if not failure and not self.no_delete_jobs:
await self._delete_job(job_name)
# ========================================================================
@ -643,7 +645,7 @@ class K8SManager:
{
"name": "crawler",
"image": self.crawler_image,
"imagePullPolicy": "Never",
"imagePullPolicy": self.crawler_image_pull_policy,
"command": [
"crawl",
"--config",

View File

@ -10,6 +10,7 @@ data:
CRAWLER_NAMESPACE: {{ .Values.crawler_namespace }}
CRAWLER_IMAGE: {{ .Values.crawler_image }}
CRAWLER_PULL_POLICY: {{ .Values.crawler_pull_policy }}
CRAWL_TIMEOUT: "{{ .Values.crawl_timeout }}"
CRAWL_RETRIES: "{{ .Values.crawl_retries }}"
@ -18,6 +19,8 @@ data:
REDIS_CRAWLS_DONE_KEY: "crawls-done"
NO_DELETE_JOBS: "{{ .Values.no_delete_jobs | default '0' }}"
---
apiVersion: v1
kind: ConfigMap

View File

@ -29,8 +29,8 @@ spec:
rules:
- host: {{ .Values.ingress.host }}
http:
{{- if .Values.minio_local }}
paths:
{{- if .Values.minio_local }}
- path: /data/(.*)
pathType: Prefix
backend:
@ -51,7 +51,7 @@ spec:
{{ if .Values.ingress.tls }}
---
apiVersion: cert-manager.io/v1alpha2
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: cert-main

View File

@ -31,8 +31,8 @@ spec:
- name: nginx-resolver
emptyDir: {}
{{- if .Values.minio_local }}
initContainers:
{{- if .Values.minio_local }}
- name: init-bucket
image: {{ .Values.minio_mc_image }}
imagePullPolicy: {{ .Values.minio_pull_policy }}
@ -44,7 +44,7 @@ spec:
key: MC_HOST
command: ['/bin/sh']
args: ['-c', 'mc mb local/test-bucket; mc policy set public local/test-bucket' ]
args: ['-c', 'mc mb --ignore-existing local/{{ .Values.minio_local_bucket_name }}' ]
{{- end }}
- name: init-nginx

View File

@ -31,9 +31,19 @@ type: Opaque
stringData:
STORE_ACCESS_KEY: "{{ $storage.access_key }}"
STORE_SECRET_KEY: "{{ $storage.secret_key }}"
STORE_ENDPOINT_URL: "{{ $storage.endpoint_url }}{{ $storage.bucket_name }}/"
{{- if and $.Values.ingress.host $.Values.minio_local }}
{{- if $storage.bucket_name }}
STORE_ENDPOINT_URL: "{{ $storage.endpoint_url }}{{ $storage.bucket_name }}"
{{- else }}
STORE_ENDPOINT_URL: "{{ $storage.endpoint_url }}"
{{- end }}
{{- if $storage.access_endpoint_url }}
STORE_ACCESS_ENDPOINT_URL: "{{ $storage.access_endpoint_url }}"
{{- else if and $.Values.ingress.host $.Values.minio_local }}
STORE_ACCESS_ENDPOINT_URL: {{ $.Values.ingress.scheme | default "https" }}://{{ $.Values.ingress.host }}/data/{{ $storage.bucket_name }}/
{{- else }}
STORE_ACCESS_ENDPOINT_URL: "{{ $storage.endpoint_url }}"
{{- end }}
{{- end }}

View File

@ -57,7 +57,7 @@ redis_url: "redis://local-redis.default:6379/1"
# =========================================
crawler_image: "webrecorder/browsertrix-crawler:latest"
crawler_pull_policy: "Never"
crawler_pull_policy: "IfNotPresent"
crawler_namespace: "crawlers"
@ -68,19 +68,6 @@ crawl_retries: 1
crawler_args: "--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037"
# Storage
# =========================================
storages:
- name: "default"
access_key: "ADMIN"
secret_key: "PASSW0RD"
bucket_name: "test-bucket"
endpoint_url: "http://local-minio.default:9000/"
# Local Minio Pod (optional)
# =========================================
# set to true to use a local minio image
@ -93,6 +80,21 @@ minio_image: minio/minio
minio_mc_image: minio/mc
minio_pull_policy: "IfNotPresent"
minio_local_bucket_name: &local_bucket_name "test-bucket"
# Storage
# =========================================
# should include the local minio bucket, if enabled, and any other available buckets for default storage
storages:
- name: "default"
access_key: "ADMIN"
secret_key: "PASSW0RD"
bucket_name: *local_bucket_name
endpoint_url: "http://local-minio.default:9000/"
# Deployment options
# =========================================

View File

@ -3,7 +3,7 @@ version: '3.5'
services:
backend:
build: ./backend
image: webrecorder/browsertrix-api
image: registry.digitalocean.com/btrix/webrecorder/browsertrix-api
ports:
- 8000:8000