Misc backend fixes for cloud deployment (#26)
* misc backend fixes:
  - fix running w/o local minio
  - ensure crawler image pull policy is configurable, loaded via chart value
  - use digitalocean repo for main backend image (for now)
  - add bucket_name to config only if using default bucket
* enable all behaviors, support 'access_endpoint_url' for default storages
* debugging: add 'no_delete_jobs' setting for k8s and docker to disable deletion of completed jobs
parent 58eba70c68
commit 3d4d7049a2
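
For reference, the chart values touched by these changes might be combined in a single values override roughly like the sketch below; the credentials, bucket names, and endpoint URLs are placeholders for illustration only, not values from this commit.

# hypothetical values override illustrating the settings introduced/changed here;
# all credential/bucket/endpoint values are placeholders
crawler_image: "webrecorder/browsertrix-crawler:latest"
crawler_pull_policy: "IfNotPresent"   # now read via CRAWLER_PULL_POLICY instead of being hard-coded

# debugging: keep completed/failed crawl jobs around instead of deleting them
no_delete_jobs: "1"

# run without the local minio pod, pointing default storage at an external bucket
minio_local: false

storages:
  - name: "default"
    access_key: "EXAMPLE-ACCESS-KEY"            # placeholder
    secret_key: "EXAMPLE-SECRET-KEY"            # placeholder
    bucket_name: "my-crawl-bucket"              # placeholder; omit to use the endpoint's default bucket
    endpoint_url: "https://example-region.digitaloceanspaces.com/"   # placeholder endpoint
    access_endpoint_url: "https://my-crawl-bucket.example-cdn.com/"  # new optional key for public access
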
@@ -66,7 +66,7 @@ class RawCrawlConfig(BaseModel):
     combineWARC: Optional[bool] = False
 
     logging: Optional[str] = ""
-    behaviors: Optional[str] = "autoscroll"
+    behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
 
 
 # ============================================================================
@@ -44,6 +44,8 @@ class DockerManager:
         self.extra_crawl_params = extra_crawl_params or []
         self._event_q = None
 
+        self.no_delete_on_fail = os.environ.get("NO_DELETE_ON_FAIL", "")
+
         self.storages = {
             "default": S3Storage(
                 name="default",
@@ -111,7 +113,8 @@ class DockerManager:
 
         for container in results:
             print(f"Cleaning Up Orphan Container {container['Id']}", flush=True)
-            await container.delete()
+            if not self.no_delete_on_fail:
+                await container.delete()
 
         results = await self.client.containers.list(
             filters=json.dumps(
@@ -482,8 +485,10 @@ class DockerManager:
         if actor["Attributes"]["exitCode"] != 0:
             crawl = self._make_crawl_for_container(container, "failed", True)
             await self.crawl_ops.store_crawl(crawl)
-
-        await container.delete()
+            if not self.no_delete_on_fail:
+                await container.delete()
+        else:
+            await container.delete()
 
     # pylint: disable=no-self-use,too-many-arguments
     def _make_crawl_for_container(self, container, state, finish_now=False):
@@ -37,11 +37,13 @@ class K8SManager:
         self.namespace = namespace
         self._default_storage_endpoints = {}
 
-        self.crawler_image = os.environ.get("CRAWLER_IMAGE")
-        self.crawler_image_pull_policy = "IfNotPresent"
+        self.crawler_image = os.environ["CRAWLER_IMAGE"]
+        self.crawler_image_pull_policy = os.environ["CRAWLER_PULL_POLICY"]
 
         self.crawl_retries = int(os.environ.get("CRAWL_RETRIES", "3"))
 
+        self.no_delete_jobs = os.environ.get("NO_DELETE_JOBS", "0") != "0"
+
         self.loop = asyncio.get_running_loop()
         self.loop.create_task(self.run_event_loop())
 
@@ -319,7 +321,7 @@ class K8SManager:
             return None, None
 
         manual = job.metadata.annotations.get("btrix.run.manual") == "1"
-        if manual:
+        if manual and not self.no_delete_jobs:
             self.loop.create_task(self._delete_job(job.metadata.name))
 
         crawl = self._make_crawl_for_job(
@@ -457,7 +459,7 @@ class K8SManager:
         failure = await self.crawl_ops.store_crawl(crawl)
 
         # keep failed jobs around, for now
-        if not failure:
+        if not failure and not self.no_delete_jobs:
             await self._delete_job(job_name)
 
     # ========================================================================
@@ -643,7 +645,7 @@ class K8SManager:
                         {
                             "name": "crawler",
                             "image": self.crawler_image,
-                            "imagePullPolicy": "Never",
+                            "imagePullPolicy": self.crawler_image_pull_policy,
                             "command": [
                                 "crawl",
                                 "--config",
@@ -10,6 +10,7 @@ data:
 
   CRAWLER_NAMESPACE: {{ .Values.crawler_namespace }}
   CRAWLER_IMAGE: {{ .Values.crawler_image }}
+  CRAWLER_PULL_POLICY: {{ .Values.crawler_pull_policy }}
 
   CRAWL_TIMEOUT: "{{ .Values.crawl_timeout }}"
   CRAWL_RETRIES: "{{ .Values.crawl_retries }}"
@@ -18,6 +19,8 @@ data:
 
   REDIS_CRAWLS_DONE_KEY: "crawls-done"
 
+  NO_DELETE_JOBS: "{{ .Values.no_delete_jobs | default '0' }}"
+
 ---
 apiVersion: v1
 kind: ConfigMap
@@ -29,8 +29,8 @@ spec:
   rules:
   - host: {{ .Values.ingress.host }}
     http:
-{{- if .Values.minio_local }}
       paths:
+{{- if .Values.minio_local }}
        - path: /data/(.*)
          pathType: Prefix
          backend:
@@ -51,7 +51,7 @@ spec:
 {{ if .Values.ingress.tls }}
 ---
 
-apiVersion: cert-manager.io/v1alpha2
+apiVersion: cert-manager.io/v1
 kind: ClusterIssuer
 metadata:
   name: cert-main
@@ -31,8 +31,8 @@ spec:
       - name: nginx-resolver
         emptyDir: {}
 
-{{- if .Values.minio_local }}
       initContainers:
+{{- if .Values.minio_local }}
        - name: init-bucket
          image: {{ .Values.minio_mc_image }}
          imagePullPolicy: {{ .Values.minio_pull_policy }}
@@ -44,7 +44,7 @@ spec:
                key: MC_HOST
 
          command: ['/bin/sh']
-         args: ['-c', 'mc mb local/test-bucket; mc policy set public local/test-bucket' ]
+         args: ['-c', 'mc mb --ignore-existing local/{{ .Values.minio_local_bucket_name }}' ]
 {{- end }}
 
        - name: init-nginx
@@ -31,9 +31,19 @@ type: Opaque
 stringData:
   STORE_ACCESS_KEY: "{{ $storage.access_key }}"
   STORE_SECRET_KEY: "{{ $storage.secret_key }}"
-  STORE_ENDPOINT_URL: "{{ $storage.endpoint_url }}{{ $storage.bucket_name }}/"
-{{- if and $.Values.ingress.host $.Values.minio_local }}
+{{- if $storage.bucket_name }}
+  STORE_ENDPOINT_URL: "{{ $storage.endpoint_url }}{{ $storage.bucket_name }}"
+{{- else }}
+  STORE_ENDPOINT_URL: "{{ $storage.endpoint_url }}"
+{{- end }}
+
+{{- if $storage.access_endpoint_url }}
+  STORE_ACCESS_ENDPOINT_URL: "{{ $storage.access_endpoint_url }}"
+{{- else if and $.Values.ingress.host $.Values.minio_local }}
   STORE_ACCESS_ENDPOINT_URL: {{ $.Values.ingress.scheme | default "https" }}://{{ $.Values.ingress.host }}/data/{{ $storage.bucket_name }}/
+{{- else }}
+  STORE_ACCESS_ENDPOINT_URL: "{{ $storage.endpoint_url }}"
 {{- end }}
 
 {{- end }}
@@ -57,7 +57,7 @@ redis_url: "redis://local-redis.default:6379/1"
 # =========================================
 
 crawler_image: "webrecorder/browsertrix-crawler:latest"
-crawler_pull_policy: "Never"
+crawler_pull_policy: "IfNotPresent"
 
 crawler_namespace: "crawlers"
 
@@ -68,19 +68,6 @@ crawl_retries: 1
 crawler_args: "--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037"
 
 
-
-# Storage
-# =========================================
-
-storages:
-  - name: "default"
-    access_key: "ADMIN"
-    secret_key: "PASSW0RD"
-    bucket_name: "test-bucket"
-
-    endpoint_url: "http://local-minio.default:9000/"
-
-
 # Local Minio Pod (optional)
 # =========================================
 # set to true to use a local minio image
@@ -93,6 +80,21 @@ minio_image: minio/minio
 minio_mc_image: minio/mc
 minio_pull_policy: "IfNotPresent"
 
+minio_local_bucket_name: &local_bucket_name "test-bucket"
+
+
+# Storage
+# =========================================
+# should include the local minio bucket, if enabled, and any other available buckets for default storage
+
+storages:
+  - name: "default"
+    access_key: "ADMIN"
+    secret_key: "PASSW0RD"
+    bucket_name: *local_bucket_name
+
+    endpoint_url: "http://local-minio.default:9000/"
+
 
 # Deployment options
 # =========================================
|
@ -3,7 +3,7 @@ version: '3.5'
|
|||||||
services:
|
services:
|
||||||
backend:
|
backend:
|
||||||
build: ./backend
|
build: ./backend
|
||||||
image: webrecorder/browsertrix-api
|
image: registry.digitalocean.com/btrix/webrecorder/browsertrix-api
|
||||||
ports:
|
ports:
|
||||||
- 8000:8000
|
- 8000:8000
|
||||||
|
|
||||||
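
On the Docker (non-Kubernetes) side, the equivalent debugging switch is the NO_DELETE_ON_FAIL environment variable read by DockerManager. A minimal sketch of passing it through docker-compose, assuming it is simply exported into the backend service's environment (the override file itself is not part of this commit):

# hypothetical compose override; only illustrates wiring the new debugging variable
version: '3.5'

services:
  backend:
    environment:
      # any non-empty value keeps failed/orphaned crawl containers around for inspection
      - NO_DELETE_ON_FAIL=1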
|