diff --git a/.gitignore b/.gitignore
index ba294346..27803a55 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,3 @@
 **/*.pyc
 **/node_modules/
-config.env
+**/config.env
diff --git a/backend/crawlconfigs.py b/backend/crawlconfigs.py
index d027666e..ac0ef0b5 100644
--- a/backend/crawlconfigs.py
+++ b/backend/crawlconfigs.py
@@ -135,11 +135,11 @@ class CrawlOps:
 
         crawlconfig = CrawlConfig.from_dict(data)
 
-        await self.crawl_manager.add_crawl_config(
+        new_name = await self.crawl_manager.add_crawl_config(
             crawlconfig=crawlconfig, storage=archive.storage
         )
 
-        return result
+        return result, new_name
 
     async def update_crawl_schedule(self, cid: str, update: UpdateSchedule):
         """ Update schedule for existing crawl config"""
@@ -216,8 +216,8 @@ def init_crawl_config_api(mdb, user_dep, archive_ops, crawl_manager):
         archive: Archive = Depends(archive_crawl_dep),
         user: User = Depends(user_dep),
     ):
-        res = await ops.add_crawl_config(config, archive, user)
-        return {"added": str(res.inserted_id)}
+        res, new_job_name = await ops.add_crawl_config(config, archive, user)
+        return {"added": str(res.inserted_id), "run_now_job": new_job_name}
 
     @router.patch("/{cid}/schedule", dependencies=[Depends(archive_crawl_dep)])
     async def update_crawl_schedule(
diff --git a/backend/crawls.py b/backend/crawls.py
index cdea69d1..54076f0d 100644
--- a/backend/crawls.py
+++ b/backend/crawls.py
@@ -5,7 +5,7 @@ import asyncio
 from typing import Optional, List, Dict
 from datetime import datetime
 
-from fastapi import Depends, HTTPException
+from fastapi import Depends, Request, HTTPException
 from pydantic import BaseModel
 import pymongo
 import aioredis
@@ -197,7 +197,7 @@ class CrawlOps:
 
 
 # ============================================================================
-# pylint: disable=too-many-arguments
+# pylint: disable=too-many-arguments, too-many-locals
 def init_crawls_api(app, mdb, redis_url, crawl_manager, crawl_config_ops, archives):
     """ API for crawl management, including crawl done callback"""
 
@@ -276,6 +276,16 @@ def init_crawls_api(app, mdb, redis_url, crawl_manager, crawl_config_ops, archiv
 
         return {"deleted": res}
 
+    @app.get(
+        "/archives/{aid}/crawls/{crawl_id}/running",
+        tags=["crawls"],
+    )
+    async def get_running(crawl_id, archive: Archive = Depends(archive_crawl_dep)):
+        if not await crawl_manager.is_running(crawl_id, archive.id):
+            raise HTTPException(status_code=404, detail="No Such Crawl")
+
+        return {"running": True}
+
     @app.post(
         "/archives/{aid}/crawls/{crawl_id}/scale",
         tags=["crawls"],
@@ -289,3 +299,11 @@ def init_crawls_api(app, mdb, redis_url, crawl_manager, crawl_config_ops, archiv
             raise HTTPException(status_code=400, detail=error)
 
         return {"scaled": scale.scale}
+
+    @app.post("/archives/{aid}/crawls/{crawl_id}/watch", tags=["crawls"])
+    async def watch_crawl(
+        crawl_id, request: Request, archive: Archive = Depends(archive_crawl_dep)
+    ):
+        await crawl_manager.init_crawl_screencast(crawl_id, archive.id)
+        watch_url = f"{request.url.scheme}://{request.url.netloc}/watch/{archive.id}/{crawl_id}/ws"
+        return {"watch_url": watch_url}
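Note: together, the two backend changes above mean a client can create a crawl config, get back the name of the run-now job, and then poll and watch that crawl. A rough client-side sketch of that flow (the base URL, token, and ids are hypothetical placeholders; the requests package is assumed):

# Hypothetical client sketch for the new endpoints; not part of this diff.
import requests

BASE_URL = "https://btrix.example.com"  # hypothetical deployment
HEADERS = {"Authorization": "Bearer <token>"}  # a valid JWT for this user
AID = "my-archive-id"  # hypothetical archive id
CRAWL_ID = "crawl-now-20211001000000-myconfig"  # as returned in "run_now_job"

# GET .../running returns 404 if the job is gone or belongs to another archive
resp = requests.get(
    f"{BASE_URL}/archives/{AID}/crawls/{CRAWL_ID}/running", headers=HEADERS
)

if resp.status_code == 200:
    # POST .../watch creates the screencast Service and returns the ws URL
    watch = requests.post(
        f"{BASE_URL}/archives/{AID}/crawls/{CRAWL_ID}/watch", headers=HEADERS
    )
    print(watch.json()["watch_url"])  # .../watch/<aid>/<crawl_id>/ws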
diff --git a/backend/k8sman.py b/backend/k8sman.py
index 3212bd5d..957ac75b 100644
--- a/backend/k8sman.py
+++ b/backend/k8sman.py
@@ -216,9 +216,10 @@ class K8SManager:
 
         # Run Job Now
         if run_now:
-            await self._create_run_now_job(cron_job)
+            new_job = await self._create_run_now_job(cron_job)
+            return new_job.metadata.name
 
-        return cron_job
+        return ""
 
     async def update_crawl_schedule(self, cid, schedule):
         """ Update the schedule for existing crawl config """
@@ -282,6 +283,31 @@ class K8SManager:
             if job.status.active
         ]
 
+    async def init_crawl_screencast(self, crawl_id, aid):
+        """ Init service for this job/crawl_id to support screencasting """
+        labels = {"btrix.archive": aid}
+
+        service = client.V1Service(
+            kind="Service",
+            api_version="v1",
+            metadata={
+                "name": crawl_id,
+                "labels": labels,
+            },
+            spec={
+                "selector": {"job-name": crawl_id},
+                "ports": [{"protocol": "TCP", "port": 9037, "name": "screencast"}],
+            },
+        )
+
+        try:
+            await self.core_api.create_namespaced_service(
+                body=service, namespace=self.namespace
+            )
+        except client.exceptions.ApiException as api_exc:
+            if api_exc.status != 409:
+                raise api_exc
+
     async def process_crawl_complete(self, crawlcomplete):
         """Ensure the crawlcomplete data is valid (job exists and user matches)
         Fill in additional details about the crawl"""
@@ -315,6 +341,21 @@ class K8SManager:
 
         return crawl, crawl_file
 
+    async def is_running(self, job_name, aid):
+        """ Return true if the specified crawl (by job_name) is running """
+        try:
+            job = await self.batch_api.read_namespaced_job(
+                name=job_name, namespace=self.namespace
+            )
+
+            if not job or job.metadata.labels["btrix.archive"] != aid:
+                return False
+
+            return True
+        # pylint: disable=broad-except
+        except Exception:
+            return False
+
     async def stop_crawl(self, job_name, aid, graceful=True):
         """Attempt to stop crawl, either gracefully by issuing a SIGTERM which
         will attempt to finish current pages
@@ -435,6 +476,17 @@ class K8SManager:
             propagation_policy="Foreground",
         )
 
+        try:
+            await self.core_api.delete_namespaced_service(
+                name=name,
+                namespace=self.namespace,
+                grace_period_seconds=60,
+                propagation_policy="Foreground",
+            )
+        # pylint: disable=bare-except
+        except:
+            pass
+
     def _create_config_map(self, crawlconfig, labels):
         """ Create Config Map based on CrawlConfig + labels """
         config_map = client.V1ConfigMap(
@@ -498,12 +550,6 @@ class K8SManager:
             propagation_policy="Foreground",
         )
 
-        # await self.core_api.delete_collection_namespaced_secret(
-        #    namespace=self.namespace,
-        #    label_selector=label,
-        #    propagation_policy="Foreground",
-        # )
-
         await self.core_api.delete_collection_namespaced_config_map(
             namespace=self.namespace,
             label_selector=label,
@@ -527,7 +573,6 @@ class K8SManager:
         ts_now = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
         name = f"crawl-now-{ts_now}-{cron_job.metadata.labels['btrix.crawlconfig']}"
 
-        print("NAME", name, flush=True)
 
         object_meta = client.V1ObjectMeta(
             name=name,
diff --git a/backend/users.py b/backend/users.py
index f3a1138f..620a88b2 100644
--- a/backend/users.py
+++ b/backend/users.py
@@ -44,6 +44,7 @@ class User(models.BaseUser):
     Base User Model
     """
 
+
 # ============================================================================
 class UserCreate(models.BaseUserCreate):
     """
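The Service created by init_crawl_screencast above is named after the crawl's job and selects its pods via the job-name label that Kubernetes adds to job pods, so the crawler's screencast port becomes resolvable through cluster DNS. A minimal in-cluster sketch of that addressing (the crawl id is hypothetical; the namespace matches this chart's crawler_namespace default):

# In-cluster sketch: once the Service exists, a crawl's screencast port is
# reachable at <crawl_id>.<crawler_namespace>.svc.cluster.local:9037.
import socket

crawl_id = "crawl-now-20211001000000-myconfig"  # hypothetical Service name
host = f"{crawl_id}.crawlers.svc.cluster.local"  # "crawlers" per values.yaml

with socket.create_connection((host, 9037), timeout=5):
    print("screencast port is reachable")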
diff --git a/chart/nginx.conf b/chart/nginx.conf
new file mode 100644
index 00000000..129b8937
--- /dev/null
+++ b/chart/nginx.conf
@@ -0,0 +1,67 @@
+worker_processes 1;
+error_log stderr;
+pid /var/run/nginx.pid;
+events {
+    worker_connections 1024;
+}
+
+http {
+    include /etc/nginx/mime.types;
+    default_type application/octet-stream;
+    log_format main '$remote_addr - $remote_user [$time_local] "$request" '
+                    '$status $body_bytes_sent "$http_referer" '
+                    '"$http_user_agent" "$http_x_forwarded_for"';
+    access_log /dev/stdout;
+    sendfile on;
+    keepalive_timeout 65;
+    include ./resolvers/resolvers.conf;
+
+    server {
+        listen 80 default_server;
+        server_name _;
+        proxy_buffering off;
+        proxy_buffers 16 64k;
+        proxy_buffer_size 64k;
+        root /usr/share/nginx/html;
+        index index.html index.htm;
+        error_page 500 501 502 503 504 /50x.html;
+        merge_slashes off;
+        location = /50x.html {
+            root /usr/share/nginx/html;
+        }
+
+        location ~* /watch/([^/]+)/([^/]+)/ws {
+            set $archive $1;
+            set $crawl $2;
+            #auth_request /authcheck;
+
+            proxy_pass http://$2.crawlers.svc.cluster.local:9037/ws;
+            proxy_set_header Host "localhost";
+
+            proxy_http_version 1.1;
+            proxy_set_header Upgrade $http_upgrade;
+            proxy_set_header Connection $http_connection;
+        }
+
+        location ~* /watch/([^/]+)/([^/]+)/ {
+            set $archive $1;
+            set $crawl $2;
+            #auth_request /authcheck;
+
+            proxy_pass http://$2.crawlers.svc.cluster.local:9037/;
+            proxy_set_header Host "localhost";
+        }
+
+        location = /authcheck {
+            internal;
+            proxy_pass http://localhost:8000/archives/$archive/crawls/$crawl;
+            proxy_pass_request_body off;
+            proxy_set_header Content-Length "";
+        }
+
+        location / {
+            proxy_pass http://localhost:8000/;
+            proxy_set_header Host $host;
+        }
+    }
+}
diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml
new file mode 100644
index 00000000..8f623ed5
--- /dev/null
+++ b/chart/templates/configmap.yaml
@@ -0,0 +1,40 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ .Values.name }}-env-config
+  namespace: {{ .Release.Namespace }}
+
+data:
+  MONGO_HOST: {{ .Values.mongo_host }}
+
+  CRAWLER_NAMESPACE: {{ .Values.crawler_namespace }}
+  CRAWLER_IMAGE: {{ .Values.crawler_image }}
+
+  CRAWL_TIMEOUT: "{{ .Values.crawl_timeout }}"
+  CRAWL_RETRIES: "{{ .Values.crawl_retries }}"
+
+  REDIS_URL: "{{ .Values.redis_url }}"
+
+
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: shared-crawler-config
+  namespace: {{ .Values.crawler_namespace }}
+
+data:
+  CRAWL_ARGS: "{{ .Values.crawler_args }} --redisStoreUrl {{ .Values.redis_url }}"
+  WEBHOOK_URL: "http://browsertrix-cloud.default/_crawls/done"
+  STORE_USER: ""
+
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: nginx-config
+  namespace: {{ .Release.Namespace }}
+
+data:
+{{ (.Files.Glob "*.conf").AsConfig | indent 2 }}
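On the client side, the watch_url returned by the backend resolves to the first location block above, which upgrades the connection and proxies it to the crawler's screencast port. A sketch of consuming that stream (using the third-party websockets package, assumed installed; the URL is hypothetical):

# Sketch: reading from the screencast WebSocket that nginx proxies above.
import asyncio
import websockets  # third-party package, assumed installed

async def watch(watch_url: str):
    # watch_url comes from POST .../watch; switch http(s):// to ws(s)://
    ws_url = watch_url.replace("http", "ws", 1)
    async with websockets.connect(ws_url) as ws:
        async for message in ws:
            print(f"received {len(message)} bytes of screencast data")

# asyncio.run(watch("https://btrix.example.com/watch/<aid>/<crawl_id>/ws"))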
diff --git a/chart/templates/ingress.yaml b/chart/templates/ingress.yaml
index f0b4ad3a..c3999ef8 100644
--- a/chart/templates/ingress.yaml
+++ b/chart/templates/ingress.yaml
@@ -1,43 +1,50 @@
-
-# Ingress - only add ingress if 'host' is defined
-{{ if .Values.ingress.host }}
+{{- if .Values.ingress.host }}
 ---
-apiVersion: networking.k8s.io/v1beta1
+apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
-  name: ingress-{{ .Values.name }}
+  name: ingress-main
   namespace: {{ .Release.Namespace }}
   annotations:
     kubernetes.io/ingress.class: "nginx"
+    #nginx.ingress.kubernetes.io/rewrite-target: /$1
     nginx.ingress.kubernetes.io/enable-cors: "true"
     nginx.ingress.kubernetes.io/cors-allow-origin: "*"
-    cert-manager.io/cluster-issuer: "cert-{{ .Values.name }}"
+    {{- if .Values.ingress.tls }}
+    cert-manager.io/cluster-issuer: "cert-main"
+    {{- end }}
     nginx.ingress.kubernetes.io/upstream-vhost: "{{ .Values.ingress.host }}"
     nginx.ingress.kubernetes.io/configuration-snippet: |
-      proxy_set_header X-Forwarded-Proto {{ .Values.scheme | default "https" }};
+      proxy_set_header X-Forwarded-Proto {{ .Values.ingress.scheme | default "https" }};
 
 spec:
+  {{- if .Values.ingress.tls }}
   tls:
     - hosts:
         - {{ .Values.ingress.host }}
-      secretName: cert-{{ .Values.name }}
+      secretName: cert-main
+  {{- end }}
 
   rules:
   - host: {{ .Values.ingress.host }}
     http:
       paths:
       - path: /
+        pathType: Prefix
         backend:
-          serviceName: {{ .Values.name }}
-          servicePort: 80
+          service:
+            name: browsertrix-cloud
+            port:
+              number: 80
 
+{{ if .Values.ingress.tls }}
 ---
 apiVersion: cert-manager.io/v1alpha2
 kind: ClusterIssuer
 metadata:
-  name: cert-{{ .Values.name }}
+  name: cert-main
   namespace: cert-manager
 spec:
   acme:
@@ -47,7 +54,7 @@ spec:
     email: {{ .Values.ingress.cert_email }}
     # Name of a secret used to store the ACME account private key
     privateKeySecretRef:
-      name: cert-{{ .Values.name }}
+      name: cert-main
     # Enable the HTTP-01 challenge provider
     solvers:
     - http01:
@@ -55,3 +62,4 @@ spec:
         class: nginx
 
 {{ end }}
+{{ end }}
diff --git a/chart/templates/main.yaml b/chart/templates/main.yaml
index e7f84ed5..c31f2905 100644
--- a/chart/templates/main.yaml
+++ b/chart/templates/main.yaml
@@ -20,6 +20,16 @@ spec:
         "helm.update": {{ randAlphaNum 5 | quote }}
 
     spec:
+      volumes:
+        - name: nginx-config
+          configMap:
+            name: nginx-config
+            items:
+              - key: nginx.conf
+                path: nginx.conf
+
+        - name: nginx-resolver
+          emptyDir: {}
 
 {{- if .Values.minio_local }}
       initContainers:
@@ -37,7 +47,41 @@ spec:
           args: ['-c', 'mc mb local/test-bucket; mc policy set public local/test-bucket' ]
 {{- end }}
 
+        - name: init-nginx
+          image: {{ .Values.nginx_image }}
+          command: ["/bin/sh"]
+          args: ["-c", "echo resolver $(awk 'BEGIN{ORS=\" \"} $1==\"nameserver\" {print $2}' /etc/resolv.conf) valid=30s \";\" > /etc/nginx/resolvers/resolvers.conf"]
+          volumeMounts:
+            - name: nginx-resolver
+              mountPath: /etc/nginx/resolvers/
+
       containers:
+        - name: nginx
+          image: {{ .Values.nginx_image }}
+          imagePullPolicy: {{ .Values.nginx_pull_policy }}
+          volumeMounts:
+            - name: nginx-config
+              mountPath: /etc/nginx/nginx.conf
+              subPath: nginx.conf
+              readOnly: true
+
+            - name: nginx-resolver
+              mountPath: /etc/nginx/resolvers/
+              readOnly: true
+
+          resources:
+            limits:
+              cpu: {{ .Values.nginx_limit_cpu }}
+
+            requests:
+              cpu: {{ .Values.nginx_requests_cpu }}
+
+          readinessProbe:
+            httpGet:
+              path: /docs
+              port: 80
+
         - name: api
           image: {{ .Values.api_image }}
           imagePullPolicy: {{ .Values.api_pull_policy }}
@@ -56,7 +100,10 @@ spec:
               cpu: {{ .Values.api_requests_cpu }}
               memory: {{ .Values.api_requests_memory }}
 
-
+          readinessProbe:
+            httpGet:
+              path: /docs
+              port: 8000
 
 ---
 apiVersion: v1
@@ -89,7 +136,7 @@ spec:
   ports:
     - protocol: TCP
-      port: 8000
+      port: 80
       name: api
 
   #externalIPs:
   #  - 127.0.0.1
diff --git a/chart/templates/redis.yaml b/chart/templates/redis.yaml
new file mode 100644
index 00000000..458fb5c8
--- /dev/null
+++ b/chart/templates/redis.yaml
@@ -0,0 +1,59 @@
+{{- if .Values.redis_local }}
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: local-redis
+  namespace: {{ .Release.Namespace }}
+spec:
+  selector:
+    matchLabels:
+      app: local-redis
+
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        app: local-redis
+
+    spec:
+      volumes:
+        - name: data-storage
+          hostPath:
+            path: /tmp/browsertrix-redis-data
+            type: DirectoryOrCreate
+
+      containers:
+        - name: redis
+          image: {{ .Values.redis_image }}
+          imagePullPolicy: {{ .Values.redis_pull_policy }}
+
+          args: ["--appendonly", "yes"]
+          volumeMounts:
+            - name: data-storage
+              mountPath: /data
+
+---
+apiVersion: v1
+kind: Service
+
+metadata:
+  namespace: {{ .Release.Namespace }}
+  name: local-redis
+  labels:
+    app: local-redis
+
+spec:
+  type: ClusterIP
+  selector:
+    app: local-redis
+
+  ports:
+    - protocol: TCP
+      port: 6379
+      name: redis
+
+{{- end }}
+
+
diff --git a/chart/templates/secrets.yaml b/chart/templates/secrets.yaml
new file mode 100644
index 00000000..bf569db8
--- /dev/null
+++ b/chart/templates/secrets.yaml
@@ -0,0 +1,36 @@
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: auth-secrets
+  namespace: {{ .Release.Namespace }}
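The init-nginx container above exists because nginx resolves proxy_pass hostnames that contain variables at request time, so it needs an explicit resolver directive pointing at the cluster DNS server; the awk one-liner copies the pod's nameservers into that directive. A line-by-line Python rendering of what it writes (same paths as mounted in the pod):

# Python equivalent of the init-nginx awk one-liner: emit an nginx
# "resolver" directive listing the nameservers from /etc/resolv.conf.
nameservers = []
with open("/etc/resolv.conf") as resolv:
    for line in resolv:
        fields = line.split()
        if fields and fields[0] == "nameserver":
            nameservers.append(fields[1])

with open("/etc/nginx/resolvers/resolvers.conf", "w") as out:
    out.write(f"resolver {' '.join(nameservers)} valid=30s ;\n")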
+
+type: Opaque
+stringData:
+  PASSWORD_SECRET: "{{ .Values.api_password_secret }}"
+
+{{- if .Values.minio_local }}
+{{- with (first .Values.storages) }}
+  MINIO_ROOT_USER: "{{ .access_key }}"
+  MINIO_ROOT_PASSWORD: "{{ .secret_key }}"
+
+  MC_HOST: "{{ $.Values.minio_scheme }}://{{ .access_key }}:{{ .secret_key }}@{{ $.Values.minio_host }}"
+{{- end }}
+{{- end }}
+
+
+{{- range $storage := .Values.storages }}
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: storage-{{ $storage.name }}
+  namespace: {{ $.Values.crawler_namespace }}
+
+type: Opaque
+stringData:
+  STORE_ACCESS_KEY: "{{ $storage.access_key }}"
+  STORE_SECRET_KEY: "{{ $storage.secret_key }}"
+  STORE_ENDPOINT_URL: "{{ $storage.endpoint_url }}"
+
+{{- end }}
diff --git a/chart/values.yaml b/chart/values.yaml
index c1428ecd..50716c2f 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -16,6 +16,15 @@ api_requests_memory: "100M"
 api_limits_memory: "256M"
 
 
+# Nginx Image
+# =========================================
+nginx_image: "nginx"
+nginx_pull_policy: "IfNotPresent"
+
+nginx_limit_cpu: "100m"
+nginx_requests_cpu: "25m"
+
+
 # MongoDB Image
 # =========================================
 mongo_local: true
@@ -56,7 +65,7 @@ crawler_namespace: "crawlers"
 crawl_retries: 1
 
 # browsertrix-crawler args:
-crawler_args: "--timeout 90 --logging stats,behaviors,debug --generateWACZ"
+crawler_args: "--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037 --headless"
 
 
 
@@ -90,8 +99,11 @@ minio_pull_policy: "IfNotPresent"
 
 # Ingress (Optional)
 # Optional: if 'host' is set, a publicly accessible Ingress controller is created with an SSL cert (using letsencrypt)
 ingress:
-  host: ""
-  cert_email: "ikreymer@gmail.com"
+  host: "btrix.cloud"
+  cert_email: "test@example.com"
+  scheme: "http"
+  tls: false
+
 
 
diff --git a/config.sample.env b/configs/config.sample.env
similarity index 100%
rename from config.sample.env
rename to configs/config.sample.env
diff --git a/docker-compose.yml b/docker-compose.yml
index 89dc8700..5a1709c6 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -11,7 +11,7 @@ services:
       - /var/run/docker.sock:/var/run/docker.sock
 
     env_file:
-      - ./config.env
+      - ./configs/config.env
 
     depends_on:
       - minio
@@ -33,7 +33,7 @@ services:
       - btrix-mongo-data:/data/db
 
     env_file:
-      - ./config.env
+      - ./configs/config.env
 
   minio:
     image: minio/minio
@@ -46,7 +46,7 @@ services:
       - btrix-minio-data:/data
 
     env_file:
-      - ./config.env
+      - ./configs/config.env
 
   init_minio_bucket:
     image: minio/mc
@@ -54,7 +54,7 @@ services:
     command: ['-c', 'mc mb local/test-bucket; mc policy set public local/test-bucket' ]
 
     env_file:
-      - ./config.env
+      - ./configs/config.env
 
     depends_on:
       - minio
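End to end, the pieces above chain together: creating a crawl config with run-now set returns the new job's name, which feeds straight into the watch flow. A final hypothetical sketch (the request payload shape is an assumption based on the CrawlConfig model, which this diff does not show):

# Hypothetical end-to-end flow; payload field names are assumptions.
import requests

BASE_URL = "https://btrix.example.com"  # hypothetical
HEADERS = {"Authorization": "Bearer <token>"}  # hypothetical

new_config = {
    "schedule": "",  # assumed field
    "runNow": True,  # assumed field
    "config": {"seeds": ["https://example.com/"]},  # assumed shape
}

created = requests.post(
    f"{BASE_URL}/archives/my-archive-id/crawlconfigs/",
    json=new_config,
    headers=HEADERS,
).json()

crawl_id = created["run_now_job"]  # new field added in this diff
watch = requests.post(
    f"{BASE_URL}/archives/my-archive-id/crawls/{crawl_id}/watch", headers=HEADERS
)
print(watch.json()["watch_url"])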