From 3d4d7049a2aa63cc3d70dbb8ed10232d3af81265 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 25 Nov 2021 11:58:26 -0800 Subject: [PATCH] Misc backend fixes for cloud deployment (#26) * misc backend fixes: - fix running w/o local minio - ensure crawler image pull policy is configurable, loaded via chart value - use digitalocean repo for main backend image (for now) - add bucket_name to config only if using default bucket * enable all behaviors, support 'access_endpoint_url' for default storages * debugging: add 'no_delete_jobs' setting for k8s and docker to disable deletion of completed jobs --- backend/crawlconfigs.py | 2 +- backend/dockerman.py | 11 ++++++++--- backend/k8sman.py | 12 +++++++----- chart/templates/configmap.yaml | 3 +++ chart/templates/ingress.yaml | 4 ++-- chart/templates/main.yaml | 4 ++-- chart/templates/secrets.yaml | 14 ++++++++++++-- chart/values.yaml | 30 ++++++++++++++++-------------- docker-compose.yml | 2 +- 9 files changed, 52 insertions(+), 30 deletions(-) diff --git a/backend/crawlconfigs.py b/backend/crawlconfigs.py index 1987630f..1ef4e34c 100644 --- a/backend/crawlconfigs.py +++ b/backend/crawlconfigs.py @@ -66,7 +66,7 @@ class RawCrawlConfig(BaseModel): combineWARC: Optional[bool] = False logging: Optional[str] = "" - behaviors: Optional[str] = "autoscroll" + behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific" # ============================================================================ diff --git a/backend/dockerman.py b/backend/dockerman.py index 27ff6777..7f420607 100644 --- a/backend/dockerman.py +++ b/backend/dockerman.py @@ -44,6 +44,8 @@ class DockerManager: self.extra_crawl_params = extra_crawl_params or [] self._event_q = None + self.no_delete_on_fail = os.environ.get("NO_DELETE_ON_FAIL", "") + self.storages = { "default": S3Storage( name="default", @@ -111,7 +113,8 @@ class DockerManager: for container in results: print(f"Cleaning Up Orphan Container {container['Id']}", flush=True) - await 
container.delete() + if not self.no_delete_on_fail: + await container.delete() results = await self.client.containers.list( filters=json.dumps( @@ -482,8 +485,10 @@ class DockerManager: if actor["Attributes"]["exitCode"] != 0: crawl = self._make_crawl_for_container(container, "failed", True) await self.crawl_ops.store_crawl(crawl) - - await container.delete() + if not self.no_delete_on_fail: + await container.delete() + else: + await container.delete() # pylint: disable=no-self-use,too-many-arguments def _make_crawl_for_container(self, container, state, finish_now=False): diff --git a/backend/k8sman.py b/backend/k8sman.py index 01b11bab..89184005 100644 --- a/backend/k8sman.py +++ b/backend/k8sman.py @@ -37,11 +37,13 @@ class K8SManager: self.namespace = namespace self._default_storage_endpoints = {} - self.crawler_image = os.environ.get("CRAWLER_IMAGE") - self.crawler_image_pull_policy = "IfNotPresent" + self.crawler_image = os.environ["CRAWLER_IMAGE"] + self.crawler_image_pull_policy = os.environ["CRAWLER_PULL_POLICY"] self.crawl_retries = int(os.environ.get("CRAWL_RETRIES", "3")) + self.no_delete_jobs = os.environ.get("NO_DELETE_JOBS", "0") != "0" + self.loop = asyncio.get_running_loop() self.loop.create_task(self.run_event_loop()) @@ -319,7 +321,7 @@ class K8SManager: return None, None manual = job.metadata.annotations.get("btrix.run.manual") == "1" - if manual: + if manual and not self.no_delete_jobs: self.loop.create_task(self._delete_job(job.metadata.name)) crawl = self._make_crawl_for_job( @@ -457,7 +459,7 @@ class K8SManager: failure = await self.crawl_ops.store_crawl(crawl) # keep failed jobs around, for now - if not failure: + if not failure and not self.no_delete_jobs: await self._delete_job(job_name) # ======================================================================== @@ -643,7 +645,7 @@ class K8SManager: { "name": "crawler", "image": self.crawler_image, - "imagePullPolicy": "Never", + "imagePullPolicy": self.crawler_image_pull_policy, "command": 
[ "crawl", "--config", diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 1c4d8aa9..35730f74 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -10,6 +10,7 @@ data: CRAWLER_NAMESPACE: {{ .Values.crawler_namespace }} CRAWLER_IMAGE: {{ .Values.crawler_image }} + CRAWLER_PULL_POLICY: {{ .Values.crawler_pull_policy }} CRAWL_TIMEOUT: "{{ .Values.crawl_timeout }}" CRAWL_RETRIES: "{{ .Values.crawl_retries }}" @@ -18,6 +19,8 @@ data: REDIS_CRAWLS_DONE_KEY: "crawls-done" + NO_DELETE_JOBS: "{{ .Values.no_delete_jobs | default 0 }}" + --- apiVersion: v1 kind: ConfigMap diff --git a/chart/templates/ingress.yaml b/chart/templates/ingress.yaml index 41073a29..ee51c30f 100644 --- a/chart/templates/ingress.yaml +++ b/chart/templates/ingress.yaml @@ -29,8 +29,8 @@ spec: rules: - host: {{ .Values.ingress.host }} http: -{{- if .Values.minio_local }} paths: +{{- if .Values.minio_local }} - path: /data/(.*) pathType: Prefix backend: @@ -51,7 +51,7 @@ spec: {{ if .Values.ingress.tls }} --- -apiVersion: cert-manager.io/v1alpha2 +apiVersion: cert-manager.io/v1 kind: ClusterIssuer metadata: name: cert-main diff --git a/chart/templates/main.yaml b/chart/templates/main.yaml index efdc0860..f1fd70e0 100644 --- a/chart/templates/main.yaml +++ b/chart/templates/main.yaml @@ -31,8 +31,8 @@ spec: - name: nginx-resolver emptyDir: {} -{{- if .Values.minio_local }} initContainers: +{{- if .Values.minio_local }} - name: init-bucket image: {{ .Values.minio_mc_image }} imagePullPolicy: {{ .Values.minio_pull_policy }} @@ -44,7 +44,7 @@ spec: key: MC_HOST command: ['/bin/sh'] - args: ['-c', 'mc mb local/test-bucket; mc policy set public local/test-bucket' ] + args: ['-c', 'mc mb --ignore-existing local/{{ .Values.minio_local_bucket_name }}' ] {{- end }} - name: init-nginx diff --git a/chart/templates/secrets.yaml b/chart/templates/secrets.yaml index ecaf87b3..12ba59dc 100644 --- a/chart/templates/secrets.yaml +++ 
b/chart/templates/secrets.yaml @@ -31,9 +31,19 @@ type: Opaque stringData: STORE_ACCESS_KEY: "{{ $storage.access_key }}" STORE_SECRET_KEY: "{{ $storage.secret_key }}" - STORE_ENDPOINT_URL: "{{ $storage.endpoint_url }}{{ $storage.bucket_name }}/" - {{- if and $.Values.ingress.host $.Values.minio_local }} + + {{- if $storage.bucket_name }} + STORE_ENDPOINT_URL: "{{ $storage.endpoint_url }}{{ $storage.bucket_name }}" + {{- else }} + STORE_ENDPOINT_URL: "{{ $storage.endpoint_url }}" + {{- end }} + + {{- if $storage.access_endpoint_url }} + STORE_ACCESS_ENDPOINT_URL: "{{ $storage.access_endpoint_url }}" + {{- else if and $.Values.ingress.host $.Values.minio_local }} STORE_ACCESS_ENDPOINT_URL: {{ $.Values.ingress.scheme | default "https" }}://{{ $.Values.ingress.host }}/data/{{ $storage.bucket_name }}/ + {{- else }} + STORE_ACCESS_ENDPOINT_URL: "{{ $storage.endpoint_url }}" {{- end }} {{- end }} diff --git a/chart/values.yaml b/chart/values.yaml index c18c6eaa..d3607c1a 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -57,7 +57,7 @@ redis_url: "redis://local-redis.default:6379/1" # ========================================= crawler_image: "webrecorder/browsertrix-crawler:latest" -crawler_pull_policy: "Never" +crawler_pull_policy: "IfNotPresent" crawler_namespace: "crawlers" @@ -68,19 +68,6 @@ crawl_retries: 1 crawler_args: "--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037" - -# Storage -# ========================================= - -storages: - - name: "default" - access_key: "ADMIN" - secret_key: "PASSW0RD" - bucket_name: "test-bucket" - - endpoint_url: "http://local-minio.default:9000/" - - # Local Minio Pod (optional) # ========================================= # set to true to use a local minio image @@ -93,6 +80,21 @@ minio_image: minio/minio minio_mc_image: minio/mc minio_pull_policy: "IfNotPresent" +minio_local_bucket_name: &local_bucket_name "test-bucket" + + +# Storage +# ========================================= +# 
should include the local minio bucket, if enabled, and any other available buckets for default storage + +storages: + - name: "default" + access_key: "ADMIN" + secret_key: "PASSW0RD" + bucket_name: *local_bucket_name + + endpoint_url: "http://local-minio.default:9000/" + # Deployment options # ========================================= diff --git a/docker-compose.yml b/docker-compose.yml index 5a1709c6..b90a122b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,7 +3,7 @@ version: '3.5' services: backend: build: ./backend - image: webrecorder/browsertrix-api + image: registry.digitalocean.com/btrix/webrecorder/browsertrix-api ports: - 8000:8000