Misc backend fixes for cloud deployment (#26)
* misc backend fixes:
  - fix running w/o local minio
  - ensure crawler image pull policy is configurable, loaded via chart value
  - use digitalocean repo for main backend image (for now)
  - add bucket_name to config only if using default bucket
* enable all behaviors, support 'access_endpoint_url' for default storages
* debugging: add 'no_delete_jobs' setting for k8s and docker to disable deletion of completed jobs
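To put the new chart knobs in context, a values override for a deployment might look roughly like the following sketch (key names come from the chart values changed in the diff below; the values themselves are purely illustrative and not part of the commit):

    # illustrative values override -- not part of the commit
    crawler_image: "webrecorder/browsertrix-crawler:latest"
    crawler_pull_policy: "IfNotPresent"      # now wired through CRAWLER_PULL_POLICY instead of hardcoded in the job spec
    no_delete_jobs: "1"                      # keep completed crawl jobs around for debugging (defaults to "0")
    minio_local: true
    minio_local_bucket_name: "test-bucket"   # used by the mc init container and, via a YAML anchor, as the default storage's bucket_name

When no_delete_jobs is non-zero, the Kubernetes manager leaves completed crawl jobs in place instead of deleting them; the Docker manager reads the analogous NO_DELETE_ON_FAIL environment variable.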
parent 58eba70c68
commit 3d4d7049a2
				| @ -66,7 +66,7 @@ class RawCrawlConfig(BaseModel): | ||||
|     combineWARC: Optional[bool] = False | ||||
| 
 | ||||
|     logging: Optional[str] = "" | ||||
|     behaviors: Optional[str] = "autoscroll" | ||||
|     behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific" | ||||
| 
 | ||||
| 
 | ||||
| # ============================================================================ | ||||
|  | ||||
| @ -44,6 +44,8 @@ class DockerManager: | ||||
|         self.extra_crawl_params = extra_crawl_params or [] | ||||
|         self._event_q = None | ||||
| 
 | ||||
|         self.no_delete_on_fail = os.environ.get("NO_DELETE_ON_FAIL", "") | ||||
| 
 | ||||
|         self.storages = { | ||||
|             "default": S3Storage( | ||||
|                 name="default", | ||||
| @ -111,6 +113,7 @@ class DockerManager: | ||||
| 
 | ||||
|             for container in results: | ||||
|                 print(f"Cleaning Up Orphan Container {container['Id']}", flush=True) | ||||
|                 if not self.no_delete_on_fail: | ||||
|                     await container.delete() | ||||
| 
 | ||||
|             results = await self.client.containers.list( | ||||
| @ -482,7 +485,9 @@ class DockerManager: | ||||
|         if actor["Attributes"]["exitCode"] != 0: | ||||
|             crawl = self._make_crawl_for_container(container, "failed", True) | ||||
|             await self.crawl_ops.store_crawl(crawl) | ||||
| 
 | ||||
|             if not self.no_delete_on_fail: | ||||
|                 await container.delete() | ||||
|         else: | ||||
|             await container.delete() | ||||
| 
 | ||||
|     # pylint: disable=no-self-use,too-many-arguments | ||||
|  | ||||
| @ -37,11 +37,13 @@ class K8SManager: | ||||
|         self.namespace = namespace | ||||
|         self._default_storage_endpoints = {} | ||||
| 
 | ||||
|         self.crawler_image = os.environ.get("CRAWLER_IMAGE") | ||||
|         self.crawler_image_pull_policy = "IfNotPresent" | ||||
|         self.crawler_image = os.environ["CRAWLER_IMAGE"] | ||||
|         self.crawler_image_pull_policy = os.environ["CRAWLER_PULL_POLICY"] | ||||
| 
 | ||||
|         self.crawl_retries = int(os.environ.get("CRAWL_RETRIES", "3")) | ||||
| 
 | ||||
|         self.no_delete_jobs = os.environ.get("NO_DELETE_JOBS", "0") != "0" | ||||
| 
 | ||||
|         self.loop = asyncio.get_running_loop() | ||||
|         self.loop.create_task(self.run_event_loop()) | ||||
| 
 | ||||
| @ -319,7 +321,7 @@ class K8SManager: | ||||
|             return None, None | ||||
| 
 | ||||
|         manual = job.metadata.annotations.get("btrix.run.manual") == "1" | ||||
|         if manual: | ||||
|         if manual and not self.no_delete_jobs: | ||||
|             self.loop.create_task(self._delete_job(job.metadata.name)) | ||||
| 
 | ||||
|         crawl = self._make_crawl_for_job( | ||||
| @ -457,7 +459,7 @@ class K8SManager: | ||||
|         failure = await self.crawl_ops.store_crawl(crawl) | ||||
| 
 | ||||
|         # keep failed jobs around, for now | ||||
|         if not failure: | ||||
|         if not failure and not self.no_delete_jobs: | ||||
|             await self._delete_job(job_name) | ||||
| 
 | ||||
|     # ======================================================================== | ||||
| @ -643,7 +645,7 @@ class K8SManager: | ||||
|                             { | ||||
|                                 "name": "crawler", | ||||
|                                 "image": self.crawler_image, | ||||
|                                 "imagePullPolicy": "Never", | ||||
|                                 "imagePullPolicy": self.crawler_image_pull_policy, | ||||
|                                 "command": [ | ||||
|                                     "crawl", | ||||
|                                     "--config", | ||||
|  | ||||
| @ -10,6 +10,7 @@ data: | ||||
| 
 | ||||
|   CRAWLER_NAMESPACE: {{ .Values.crawler_namespace }} | ||||
|   CRAWLER_IMAGE: {{ .Values.crawler_image }} | ||||
|   CRAWLER_PULL_POLICY: {{ .Values.crawler_pull_policy }} | ||||
| 
 | ||||
|   CRAWL_TIMEOUT: "{{ .Values.crawl_timeout }}" | ||||
|   CRAWL_RETRIES: "{{ .Values.crawl_retries }}" | ||||
| @ -18,6 +19,8 @@ data: | ||||
| 
 | ||||
|   REDIS_CRAWLS_DONE_KEY: "crawls-done" | ||||
| 
 | ||||
|   NO_DELETE_JOBS: "{{ .Values.no_delete_jobs | default '0' }}" | ||||
| 
 | ||||
| --- | ||||
| apiVersion: v1 | ||||
| kind: ConfigMap | ||||
|  | ||||
| @ -29,8 +29,8 @@ spec: | ||||
|   rules: | ||||
|   - host: {{ .Values.ingress.host }} | ||||
|     http: | ||||
| {{- if .Values.minio_local }} | ||||
|       paths: | ||||
| {{- if .Values.minio_local }} | ||||
|       - path: /data/(.*) | ||||
|         pathType: Prefix | ||||
|         backend: | ||||
| @ -51,7 +51,7 @@ spec: | ||||
| {{ if .Values.ingress.tls }} | ||||
| --- | ||||
| 
 | ||||
| apiVersion: cert-manager.io/v1alpha2 | ||||
| apiVersion: cert-manager.io/v1 | ||||
| kind: ClusterIssuer | ||||
| metadata: | ||||
|   name: cert-main | ||||
|  | ||||
| @ -31,8 +31,8 @@ spec: | ||||
|         - name: nginx-resolver | ||||
|           emptyDir: {} | ||||
| 
 | ||||
| {{- if .Values.minio_local }} | ||||
|       initContainers: | ||||
| {{- if .Values.minio_local }} | ||||
|         - name: init-bucket | ||||
|           image: {{ .Values.minio_mc_image }} | ||||
|           imagePullPolicy: {{ .Values.minio_pull_policy }} | ||||
| @ -44,7 +44,7 @@ spec: | ||||
|                   key: MC_HOST | ||||
| 
 | ||||
|           command: ['/bin/sh'] | ||||
|           args: ['-c', 'mc mb local/test-bucket; mc policy set public local/test-bucket' ] | ||||
|           args: ['-c', 'mc mb --ignore-existing local/{{ .Values.minio_local_bucket_name }}' ] | ||||
| {{- end }} | ||||
| 
 | ||||
|         - name: init-nginx | ||||
|  | ||||
| @ -31,9 +31,19 @@ type: Opaque | ||||
| stringData: | ||||
|   STORE_ACCESS_KEY: "{{ $storage.access_key }}" | ||||
|   STORE_SECRET_KEY: "{{ $storage.secret_key }}" | ||||
|   STORE_ENDPOINT_URL: "{{ $storage.endpoint_url }}{{ $storage.bucket_name }}/" | ||||
|   {{- if and $.Values.ingress.host $.Values.minio_local }} | ||||
| 
 | ||||
|   {{- if $storage.bucket_name }} | ||||
|   STORE_ENDPOINT_URL: "{{ $storage.endpoint_url }}{{ $storage.bucket_name }}" | ||||
|   {{- else }} | ||||
|   STORE_ENDPOINT_URL: "{{ $storage.endpoint_url }}" | ||||
|   {{- end }} | ||||
| 
 | ||||
|   {{- if $storage.access_endpoint_url }} | ||||
|   STORE_ACCESS_ENDPOINT_URL: "{{ $storage.access_endpoint_url }}" | ||||
|   {{- else if and $.Values.ingress.host $.Values.minio_local }} | ||||
|   STORE_ACCESS_ENDPOINT_URL: {{ $.Values.ingress.scheme | default "https" }}://{{ $.Values.ingress.host }}/data/{{ $storage.bucket_name }}/ | ||||
|   {{- else }} | ||||
|   STORE_ACCESS_ENDPOINT_URL: "{{ $storage.endpoint_url }}" | ||||
|   {{- end }} | ||||
| 
 | ||||
| {{- end }} | ||||
|  | ||||
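The reworked secret template above matters most for a non-local default storage: an entry that omits bucket_name (because the bucket is already part of the endpoint) and supplies access_endpoint_url now renders straight through. A hypothetical example, with placeholder URLs and keys:

    # storages entry in the chart values (illustrative only)
    - name: "default"
      access_key: "EXAMPLE_ACCESS_KEY"
      secret_key: "EXAMPLE_SECRET_KEY"
      endpoint_url: "https://s3.example.com/crawl-data/"
      access_endpoint_url: "https://media.example.com/crawl-data/"

    # resulting keys in the rendered storage secret
    STORE_ENDPOINT_URL: "https://s3.example.com/crawl-data/"
    STORE_ACCESS_ENDPOINT_URL: "https://media.example.com/crawl-data/"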
| @ -57,7 +57,7 @@ redis_url: "redis://local-redis.default:6379/1" | ||||
| # ========================================= | ||||
| 
 | ||||
| crawler_image: "webrecorder/browsertrix-crawler:latest" | ||||
| crawler_pull_policy: "Never" | ||||
| crawler_pull_policy: "IfNotPresent" | ||||
| 
 | ||||
| crawler_namespace: "crawlers" | ||||
| 
 | ||||
| @ -68,19 +68,6 @@ crawl_retries: 1 | ||||
| crawler_args: "--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037" | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| # Storage | ||||
| # ========================================= | ||||
| 
 | ||||
| storages: | ||||
|   - name: "default" | ||||
|     access_key: "ADMIN" | ||||
|     secret_key: "PASSW0RD" | ||||
|     bucket_name: "test-bucket" | ||||
| 
 | ||||
|     endpoint_url: "http://local-minio.default:9000/" | ||||
| 
 | ||||
| 
 | ||||
| # Local Minio Pod (optional) | ||||
| # ========================================= | ||||
| # set to true to use a local minio image | ||||
| @ -93,6 +80,21 @@ minio_image: minio/minio | ||||
| minio_mc_image: minio/mc | ||||
| minio_pull_policy: "IfNotPresent" | ||||
| 
 | ||||
| minio_local_bucket_name: &local_bucket_name "test-bucket" | ||||
| 
 | ||||
| 
 | ||||
| # Storage | ||||
| # ========================================= | ||||
| # should include the local minio bucket, if enabled, and any other available buckets for default storage | ||||
| 
 | ||||
| storages: | ||||
|   - name: "default" | ||||
|     access_key: "ADMIN" | ||||
|     secret_key: "PASSW0RD" | ||||
|     bucket_name: *local_bucket_name | ||||
| 
 | ||||
|     endpoint_url: "http://local-minio.default:9000/" | ||||
| 
 | ||||
| 
 | ||||
| # Deployment options | ||||
| # ========================================= | ||||
|  | ||||
| @ -3,7 +3,7 @@ version: '3.5' | ||||
| services: | ||||
|   backend: | ||||
|     build: ./backend | ||||
|     image: webrecorder/browsertrix-api | ||||
|     image: registry.digitalocean.com/btrix/webrecorder/browsertrix-api | ||||
|     ports: | ||||
|       - 8000:8000 | ||||
| 
 | ||||
|  | ||||
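On the Docker side, the new NO_DELETE_ON_FAIL flag is read straight from the backend container's environment; for local debugging it could be set with a compose override along these lines (hypothetical snippet, not part of this commit):

    services:
      backend:
        environment:
          - NO_DELETE_ON_FAIL=1   # any non-empty value keeps failed crawl containers for inspection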