Misc backend fixes for cloud deployment (#26)
* misc backend fixes:
  - fix running w/o local minio
  - ensure crawler image pull policy is configurable, loaded via chart value
  - use digitalocean repo for main backend image (for now)
  - add bucket_name to config only if using default bucket
* enable all behaviors, support 'access_endpoint_url' for default storages
* debugging: add 'no_delete_jobs' setting for k8s and docker to disable deletion of completed jobs
This commit is contained in:
		
							parent
							
								
									58eba70c68
								
							
						
					
					
						commit
						3d4d7049a2
					
				| @ -66,7 +66,7 @@ class RawCrawlConfig(BaseModel): | |||||||
|     combineWARC: Optional[bool] = False |     combineWARC: Optional[bool] = False | ||||||
| 
 | 
 | ||||||
|     logging: Optional[str] = "" |     logging: Optional[str] = "" | ||||||
|     behaviors: Optional[str] = "autoscroll" |     behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # ============================================================================ | # ============================================================================ | ||||||
|  | |||||||
| @ -44,6 +44,8 @@ class DockerManager: | |||||||
|         self.extra_crawl_params = extra_crawl_params or [] |         self.extra_crawl_params = extra_crawl_params or [] | ||||||
|         self._event_q = None |         self._event_q = None | ||||||
| 
 | 
 | ||||||
|  |         self.no_delete_on_fail = os.environ.get("NO_DELETE_ON_FAIL", "") | ||||||
|  | 
 | ||||||
|         self.storages = { |         self.storages = { | ||||||
|             "default": S3Storage( |             "default": S3Storage( | ||||||
|                 name="default", |                 name="default", | ||||||
| @ -111,7 +113,8 @@ class DockerManager: | |||||||
| 
 | 
 | ||||||
|             for container in results: |             for container in results: | ||||||
|                 print(f"Cleaning Up Orphan Container {container['Id']}", flush=True) |                 print(f"Cleaning Up Orphan Container {container['Id']}", flush=True) | ||||||
|                 await container.delete() |                 if not self.no_delete_on_fail: | ||||||
|  |                     await container.delete() | ||||||
| 
 | 
 | ||||||
|             results = await self.client.containers.list( |             results = await self.client.containers.list( | ||||||
|                 filters=json.dumps( |                 filters=json.dumps( | ||||||
| @ -482,8 +485,10 @@ class DockerManager: | |||||||
|         if actor["Attributes"]["exitCode"] != 0: |         if actor["Attributes"]["exitCode"] != 0: | ||||||
|             crawl = self._make_crawl_for_container(container, "failed", True) |             crawl = self._make_crawl_for_container(container, "failed", True) | ||||||
|             await self.crawl_ops.store_crawl(crawl) |             await self.crawl_ops.store_crawl(crawl) | ||||||
| 
 |             if not self.no_delete_on_fail: | ||||||
|         await container.delete() |                 await container.delete() | ||||||
|  |         else: | ||||||
|  |             await container.delete() | ||||||
| 
 | 
 | ||||||
|     # pylint: disable=no-self-use,too-many-arguments |     # pylint: disable=no-self-use,too-many-arguments | ||||||
|     def _make_crawl_for_container(self, container, state, finish_now=False): |     def _make_crawl_for_container(self, container, state, finish_now=False): | ||||||
|  | |||||||
| @ -37,11 +37,13 @@ class K8SManager: | |||||||
|         self.namespace = namespace |         self.namespace = namespace | ||||||
|         self._default_storage_endpoints = {} |         self._default_storage_endpoints = {} | ||||||
| 
 | 
 | ||||||
|         self.crawler_image = os.environ.get("CRAWLER_IMAGE") |         self.crawler_image = os.environ["CRAWLER_IMAGE"] | ||||||
|         self.crawler_image_pull_policy = "IfNotPresent" |         self.crawler_image_pull_policy = os.environ["CRAWLER_PULL_POLICY"] | ||||||
| 
 | 
 | ||||||
|         self.crawl_retries = int(os.environ.get("CRAWL_RETRIES", "3")) |         self.crawl_retries = int(os.environ.get("CRAWL_RETRIES", "3")) | ||||||
| 
 | 
 | ||||||
|  |         self.no_delete_jobs = os.environ.get("NO_DELETE_JOBS", "0") != "0" | ||||||
|  | 
 | ||||||
|         self.loop = asyncio.get_running_loop() |         self.loop = asyncio.get_running_loop() | ||||||
|         self.loop.create_task(self.run_event_loop()) |         self.loop.create_task(self.run_event_loop()) | ||||||
| 
 | 
 | ||||||
| @ -319,7 +321,7 @@ class K8SManager: | |||||||
|             return None, None |             return None, None | ||||||
| 
 | 
 | ||||||
|         manual = job.metadata.annotations.get("btrix.run.manual") == "1" |         manual = job.metadata.annotations.get("btrix.run.manual") == "1" | ||||||
|         if manual: |         if manual and not self.no_delete_jobs: | ||||||
|             self.loop.create_task(self._delete_job(job.metadata.name)) |             self.loop.create_task(self._delete_job(job.metadata.name)) | ||||||
| 
 | 
 | ||||||
|         crawl = self._make_crawl_for_job( |         crawl = self._make_crawl_for_job( | ||||||
| @ -457,7 +459,7 @@ class K8SManager: | |||||||
|         failure = await self.crawl_ops.store_crawl(crawl) |         failure = await self.crawl_ops.store_crawl(crawl) | ||||||
| 
 | 
 | ||||||
|         # keep failed jobs around, for now |         # keep failed jobs around, for now | ||||||
|         if not failure: |         if not failure and not self.no_delete_jobs: | ||||||
|             await self._delete_job(job_name) |             await self._delete_job(job_name) | ||||||
| 
 | 
 | ||||||
|     # ======================================================================== |     # ======================================================================== | ||||||
| @ -643,7 +645,7 @@ class K8SManager: | |||||||
|                             { |                             { | ||||||
|                                 "name": "crawler", |                                 "name": "crawler", | ||||||
|                                 "image": self.crawler_image, |                                 "image": self.crawler_image, | ||||||
|                                 "imagePullPolicy": "Never", |                                 "imagePullPolicy": self.crawler_image_pull_policy, | ||||||
|                                 "command": [ |                                 "command": [ | ||||||
|                                     "crawl", |                                     "crawl", | ||||||
|                                     "--config", |                                     "--config", | ||||||
|  | |||||||
| @ -10,6 +10,7 @@ data: | |||||||
| 
 | 
 | ||||||
|   CRAWLER_NAMESPACE: {{ .Values.crawler_namespace }} |   CRAWLER_NAMESPACE: {{ .Values.crawler_namespace }} | ||||||
|   CRAWLER_IMAGE: {{ .Values.crawler_image }} |   CRAWLER_IMAGE: {{ .Values.crawler_image }} | ||||||
|  |   CRAWLER_PULL_POLICY: {{ .Values.crawler_pull_policy }} | ||||||
| 
 | 
 | ||||||
|   CRAWL_TIMEOUT: "{{ .Values.crawl_timeout }}" |   CRAWL_TIMEOUT: "{{ .Values.crawl_timeout }}" | ||||||
|   CRAWL_RETRIES: "{{ .Values.crawl_retries }}" |   CRAWL_RETRIES: "{{ .Values.crawl_retries }}" | ||||||
| @ -18,6 +19,8 @@ data: | |||||||
| 
 | 
 | ||||||
|   REDIS_CRAWLS_DONE_KEY: "crawls-done" |   REDIS_CRAWLS_DONE_KEY: "crawls-done" | ||||||
| 
 | 
 | ||||||
|  |   NO_DELETE_JOBS: "{{ .Values.no_delete_jobs | default '0' }}" | ||||||
|  | 
 | ||||||
| --- | --- | ||||||
| apiVersion: v1 | apiVersion: v1 | ||||||
| kind: ConfigMap | kind: ConfigMap | ||||||
|  | |||||||
| @ -29,8 +29,8 @@ spec: | |||||||
|   rules: |   rules: | ||||||
|   - host: {{ .Values.ingress.host }} |   - host: {{ .Values.ingress.host }} | ||||||
|     http: |     http: | ||||||
| {{- if .Values.minio_local }} |  | ||||||
|       paths: |       paths: | ||||||
|  | {{- if .Values.minio_local }} | ||||||
|       - path: /data/(.*) |       - path: /data/(.*) | ||||||
|         pathType: Prefix |         pathType: Prefix | ||||||
|         backend: |         backend: | ||||||
| @ -51,7 +51,7 @@ spec: | |||||||
| {{ if .Values.ingress.tls }} | {{ if .Values.ingress.tls }} | ||||||
| --- | --- | ||||||
| 
 | 
 | ||||||
| apiVersion: cert-manager.io/v1alpha2 | apiVersion: cert-manager.io/v1 | ||||||
| kind: ClusterIssuer | kind: ClusterIssuer | ||||||
| metadata: | metadata: | ||||||
|   name: cert-main |   name: cert-main | ||||||
|  | |||||||
| @ -31,8 +31,8 @@ spec: | |||||||
|         - name: nginx-resolver |         - name: nginx-resolver | ||||||
|           emptyDir: {} |           emptyDir: {} | ||||||
| 
 | 
 | ||||||
| {{- if .Values.minio_local }} |  | ||||||
|       initContainers: |       initContainers: | ||||||
|  | {{- if .Values.minio_local }} | ||||||
|         - name: init-bucket |         - name: init-bucket | ||||||
|           image: {{ .Values.minio_mc_image }} |           image: {{ .Values.minio_mc_image }} | ||||||
|           imagePullPolicy: {{ .Values.minio_pull_policy }} |           imagePullPolicy: {{ .Values.minio_pull_policy }} | ||||||
| @ -44,7 +44,7 @@ spec: | |||||||
|                   key: MC_HOST |                   key: MC_HOST | ||||||
| 
 | 
 | ||||||
|           command: ['/bin/sh'] |           command: ['/bin/sh'] | ||||||
|           args: ['-c', 'mc mb local/test-bucket; mc policy set public local/test-bucket' ] |           args: ['-c', 'mc mb --ignore-existing local/{{ .Values.minio_local_bucket_name }}' ] | ||||||
| {{- end }} | {{- end }} | ||||||
| 
 | 
 | ||||||
|         - name: init-nginx |         - name: init-nginx | ||||||
|  | |||||||
| @ -31,9 +31,19 @@ type: Opaque | |||||||
| stringData: | stringData: | ||||||
|   STORE_ACCESS_KEY: "{{ $storage.access_key }}" |   STORE_ACCESS_KEY: "{{ $storage.access_key }}" | ||||||
|   STORE_SECRET_KEY: "{{ $storage.secret_key }}" |   STORE_SECRET_KEY: "{{ $storage.secret_key }}" | ||||||
|   STORE_ENDPOINT_URL: "{{ $storage.endpoint_url }}{{ $storage.bucket_name }}/" | 
 | ||||||
|   {{- if and $.Values.ingress.host $.Values.minio_local }} |   {{- if $storage.bucket_name }} | ||||||
|  |   STORE_ENDPOINT_URL: "{{ $storage.endpoint_url }}{{ $storage.bucket_name }}" | ||||||
|  |   {{- else }} | ||||||
|  |   STORE_ENDPOINT_URL: "{{ $storage.endpoint_url }}" | ||||||
|  |   {{- end }} | ||||||
|  | 
 | ||||||
|  |   {{- if $storage.access_endpoint_url }} | ||||||
|  |   STORE_ACCESS_ENDPOINT_URL: "{{ $storage.access_endpoint_url }}" | ||||||
|  |   {{- else if and $.Values.ingress.host $.Values.minio_local }} | ||||||
|   STORE_ACCESS_ENDPOINT_URL: {{ $.Values.ingress.scheme | default "https" }}://{{ $.Values.ingress.host }}/data/{{ $storage.bucket_name }}/ |   STORE_ACCESS_ENDPOINT_URL: {{ $.Values.ingress.scheme | default "https" }}://{{ $.Values.ingress.host }}/data/{{ $storage.bucket_name }}/ | ||||||
|  |   {{- else }} | ||||||
|  |   STORE_ACCESS_ENDPOINT_URL: "{{ $storage.endpoint_url }}" | ||||||
|   {{- end }} |   {{- end }} | ||||||
| 
 | 
 | ||||||
| {{- end }} | {{- end }} | ||||||
|  | |||||||
| @ -57,7 +57,7 @@ redis_url: "redis://local-redis.default:6379/1" | |||||||
| # ========================================= | # ========================================= | ||||||
| 
 | 
 | ||||||
| crawler_image: "webrecorder/browsertrix-crawler:latest" | crawler_image: "webrecorder/browsertrix-crawler:latest" | ||||||
| crawler_pull_policy: "Never" | crawler_pull_policy: "IfNotPresent" | ||||||
| 
 | 
 | ||||||
| crawler_namespace: "crawlers" | crawler_namespace: "crawlers" | ||||||
| 
 | 
 | ||||||
| @ -68,19 +68,6 @@ crawl_retries: 1 | |||||||
| crawler_args: "--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037" | crawler_args: "--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| # Storage |  | ||||||
| # ========================================= |  | ||||||
| 
 |  | ||||||
| storages: |  | ||||||
|   - name: "default" |  | ||||||
|     access_key: "ADMIN" |  | ||||||
|     secret_key: "PASSW0RD" |  | ||||||
|     bucket_name: "test-bucket" |  | ||||||
| 
 |  | ||||||
|     endpoint_url: "http://local-minio.default:9000/" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Local Minio Pod (optional) | # Local Minio Pod (optional) | ||||||
| # ========================================= | # ========================================= | ||||||
| # set to true to use a local minio image | # set to true to use a local minio image | ||||||
| @ -93,6 +80,21 @@ minio_image: minio/minio | |||||||
| minio_mc_image: minio/mc | minio_mc_image: minio/mc | ||||||
| minio_pull_policy: "IfNotPresent" | minio_pull_policy: "IfNotPresent" | ||||||
| 
 | 
 | ||||||
|  | minio_local_bucket_name: &local_bucket_name "test-bucket" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Storage | ||||||
|  | # ========================================= | ||||||
|  | # should include the local minio bucket, if enabled, and any other available buckets for default storage | ||||||
|  | 
 | ||||||
|  | storages: | ||||||
|  |   - name: "default" | ||||||
|  |     access_key: "ADMIN" | ||||||
|  |     secret_key: "PASSW0RD" | ||||||
|  |     bucket_name: *local_bucket_name | ||||||
|  | 
 | ||||||
|  |     endpoint_url: "http://local-minio.default:9000/" | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| # Deployment options | # Deployment options | ||||||
| # ========================================= | # ========================================= | ||||||
|  | |||||||
| @ -3,7 +3,7 @@ version: '3.5' | |||||||
| services: | services: | ||||||
|   backend: |   backend: | ||||||
|     build: ./backend |     build: ./backend | ||||||
|     image: webrecorder/browsertrix-api |     image: registry.digitalocean.com/btrix/webrecorder/browsertrix-api | ||||||
|     ports: |     ports: | ||||||
|       - 8000:8000 |       - 8000:8000 | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user