add ingress + nginx container for better routing

support screencasting to a dynamically created service via nginx (k8s only thus far)
add crawl /watch endpoint to enable watching; creates the service if it doesn't exist (see the client sketch below)
add crawl /running endpoint to check whether a crawl is running
nginx auth check in place, but not yet enabled
add k8s nginx.conf
add missing chart files
file reorg: move docker config to configs/
k8s: add readiness checks for nginx and api containers for smoother reloading
ensure the service is deleted along with the job
todo: update dockerman with screencast support
Ilya Kreymer 2021-10-09 22:41:01 -07:00
parent 19879fe349
commit 4ae4005d74
14 changed files with 370 additions and 37 deletions
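
For context, a minimal client-side sketch of the two new endpoints (deployment host, archive/crawl ids, and auth token are hypothetical placeholders):

import requests

API = "https://btrix.example.com"          # hypothetical deployment
HEADERS = {"Authorization": "Bearer <token>"}
AID, CRAWL_ID = "<archive-id>", "<crawl-id>"

# GET /running returns 404 if the job doesn't exist or belongs to another archive
resp = requests.get(f"{API}/archives/{AID}/crawls/{CRAWL_ID}/running", headers=HEADERS)
if resp.ok:
    # POST /watch creates the screencast Service if needed and returns the websocket URL
    watch = requests.post(f"{API}/archives/{AID}/crawls/{CRAWL_ID}/watch", headers=HEADERS)
    print(watch.json()["watch_url"])  # {scheme}://{host}/watch/<archive-id>/<crawl-id>/ws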

.gitignore

@@ -1,3 +1,3 @@
 **/*.pyc
 **/node_modules/
-config.env
+**/config.env


@@ -135,11 +135,11 @@ class CrawlOps:
         crawlconfig = CrawlConfig.from_dict(data)

-        await self.crawl_manager.add_crawl_config(
+        new_name = await self.crawl_manager.add_crawl_config(
             crawlconfig=crawlconfig, storage=archive.storage
         )

-        return result
+        return result, new_name

     async def update_crawl_schedule(self, cid: str, update: UpdateSchedule):
         """ Update schedule for existing crawl config"""

@@ -216,8 +216,8 @@ def init_crawl_config_api(mdb, user_dep, archive_ops, crawl_manager):
         archive: Archive = Depends(archive_crawl_dep),
         user: User = Depends(user_dep),
     ):
-        res = await ops.add_crawl_config(config, archive, user)
-        return {"added": str(res.inserted_id)}
+        res, new_job_name = await ops.add_crawl_config(config, archive, user)
+        return {"added": str(res.inserted_id), "run_now_job": new_job_name}

     @router.patch("/{cid}/schedule", dependencies=[Depends(archive_crawl_dep)])
     async def update_crawl_schedule(


@@ -5,7 +5,7 @@ import asyncio
 from typing import Optional, List, Dict
 from datetime import datetime

-from fastapi import Depends, HTTPException
+from fastapi import Depends, Request, HTTPException
 from pydantic import BaseModel
 import pymongo
 import aioredis

@@ -197,7 +197,7 @@ class CrawlOps:
 # ============================================================================
-# pylint: disable=too-many-arguments
+# pylint: disable=too-many-arguments, too-many-locals
 def init_crawls_api(app, mdb, redis_url, crawl_manager, crawl_config_ops, archives):
     """ API for crawl management, including crawl done callback"""
@@ -276,6 +276,16 @@ def init_crawls_api(app, mdb, redis_url, crawl_manager, crawl_config_ops, archives):
         return {"deleted": res}

+    @app.get(
+        "/archives/{aid}/crawls/{crawl_id}/running",
+        tags=["crawls"],
+    )
+    async def get_running(crawl_id, archive: Archive = Depends(archive_crawl_dep)):
+        # is_running is a coroutine and must be awaited, or the check is always truthy
+        if not await crawl_manager.is_running(crawl_id, archive.id):
+            raise HTTPException(status_code=404, detail="No Such Crawl")
+
+        return {"running": True}
+
     @app.post(
         "/archives/{aid}/crawls/{crawl_id}/scale",
         tags=["crawls"],
@@ -289,3 +299,11 @@ def init_crawls_api(app, mdb, redis_url, crawl_manager, crawl_config_ops, archives):
             raise HTTPException(status_code=400, detail=error)

         return {"scaled": scale.scale}
+
+    @app.post("/archives/{aid}/crawls/{crawl_id}/watch", tags=["crawls"])
+    async def watch_crawl(
+        crawl_id, request: Request, archive: Archive = Depends(archive_crawl_dep)
+    ):
+        await crawl_manager.init_crawl_screencast(crawl_id, archive.id)
+        watch_url = f"{request.url.scheme}://{request.url.netloc}/watch/{archive.id}/{crawl_id}/ws"
+        return {"watch_url": watch_url}


@@ -216,9 +216,10 @@ class K8SManager:
         # Run Job Now
         if run_now:
-            await self._create_run_now_job(cron_job)
+            new_job = await self._create_run_now_job(cron_job)
+            return new_job.metadata.name

-        return cron_job
+        return ""

     async def update_crawl_schedule(self, cid, schedule):
         """ Update the schedule for existing crawl config """
@@ -282,6 +283,31 @@ class K8SManager:
             if job.status.active
         ]

+    async def init_crawl_screencast(self, crawl_id, aid):
+        """ Init service for this job/crawl_id to support screencasting """
+        labels = {"btrix.archive": aid}
+
+        service = client.V1Service(
+            kind="Service",
+            api_version="v1",
+            metadata={
+                "name": crawl_id,
+                "labels": labels,
+            },
+            spec={
+                "selector": {"job-name": crawl_id},
+                "ports": [{"protocol": "TCP", "port": 9037, "name": "screencast"}],
+            },
+        )
+
+        try:
+            await self.core_api.create_namespaced_service(
+                body=service, namespace=self.namespace
+            )
+        except client.exceptions.ApiException as api_exc:
+            # 409 Conflict means the service already exists, so creation is idempotent
+            if api_exc.status != 409:
+                raise api_exc
+
     async def process_crawl_complete(self, crawlcomplete):
         """Ensure the crawlcomplete data is valid (job exists and user matches)
         Fill in additional details about the crawl"""
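
The Service is named after the crawl id and selects the job's pods via the job-name label, exposing the crawler's screencast port. A hedged sketch of resolving that Service's in-cluster address with kubernetes_asyncio (the async client this manager appears to use; the helper name is invented):

from kubernetes_asyncio import client, config

async def screencast_host(crawl_id: str, namespace: str = "crawlers") -> str:
    await config.load_kube_config()
    core = client.CoreV1Api()
    svc = await core.read_namespaced_service(name=crawl_id, namespace=namespace)
    port = svc.spec.ports[0].port  # 9037, matching the crawler's --screencastPort
    # the in-cluster DNS name that nginx proxies /watch/.../ws traffic to
    return f"{svc.metadata.name}.{namespace}.svc.cluster.local:{port}"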
@@ -315,6 +341,21 @@ class K8SManager:
         return crawl, crawl_file

+    async def is_running(self, job_name, aid):
+        """ Return true if the specified crawl (by job_name) is running """
+        try:
+            job = await self.batch_api.read_namespaced_job(
+                name=job_name, namespace=self.namespace
+            )
+            if not job or job.metadata.labels["btrix.archive"] != aid:
+                return False
+
+            return True
+        # pylint: disable=broad-except
+        except Exception:
+            return False
+
     async def stop_crawl(self, job_name, aid, graceful=True):
         """Attempt to stop crawl, either gracefully by issuing a SIGTERM which
         will attempt to finish current pages"""
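
Since every crawl job carries a btrix.archive label, the same ownership check could also be done in bulk with a label selector rather than a read-and-compare; a hypothetical sketch:

from kubernetes_asyncio import client

async def running_crawls(batch: client.BatchV1Api, aid: str, namespace: str):
    """List active crawl job names for one archive via the btrix.archive label."""
    jobs = await batch.list_namespaced_job(
        namespace=namespace, label_selector=f"btrix.archive={aid}"
    )
    return [job.metadata.name for job in jobs.items if job.status.active]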
@@ -435,6 +476,17 @@ class K8SManager:
             propagation_policy="Foreground",
         )

+        try:
+            await self.core_api.delete_namespaced_service(
+                name=name,
+                namespace=self.namespace,
+                grace_period_seconds=60,
+                propagation_policy="Foreground",
+            )
+        # pylint: disable=bare-except
+        except:
+            pass
+
     def _create_config_map(self, crawlconfig, labels):
         """ Create Config Map based on CrawlConfig + labels """
         config_map = client.V1ConfigMap(

@@ -498,12 +550,6 @@ class K8SManager:
             propagation_policy="Foreground",
         )

-        # await self.core_api.delete_collection_namespaced_secret(
-        #    namespace=self.namespace,
-        #    label_selector=label,
-        #    propagation_policy="Foreground",
-        # )
-
         await self.core_api.delete_collection_namespaced_config_map(
             namespace=self.namespace,
             label_selector=label,

@@ -527,7 +573,6 @@ class K8SManager:
         ts_now = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
         name = f"crawl-now-{ts_now}-{cron_job.metadata.labels['btrix.crawlconfig']}"

-        print("NAME", name, flush=True)

         object_meta = client.V1ObjectMeta(
             name=name,


@@ -44,6 +44,7 @@ class User(models.BaseUser):
     Base User Model
     """

+
 # ============================================================================
 class UserCreate(models.BaseUserCreate):
     """

chart/nginx.conf (new file)

@@ -0,0 +1,67 @@
+worker_processes 1;
+
+error_log stderr;
+pid /var/run/nginx.pid;
+
+events {
+    worker_connections 1024;
+}
+
+http {
+    include /etc/nginx/mime.types;
+    default_type application/octet-stream;
+
+    log_format main '$remote_addr - $remote_user [$time_local] "$request" '
+                    '$status $body_bytes_sent "$http_referer" '
+                    '"$http_user_agent" "$http_x_forwarded_for"';
+
+    access_log /dev/stdout;
+
+    sendfile on;
+    keepalive_timeout 65;
+
+    include ./resolvers/resolvers.conf;
+
+    server {
+        listen 80 default_server;
+        server_name _;
+
+        proxy_buffering off;
+        proxy_buffers 16 64k;
+        proxy_buffer_size 64k;
+
+        root /usr/share/nginx/html;
+        index index.html index.htm;
+
+        error_page 500 501 502 503 504 /50x.html;
+
+        merge_slashes off;
+
+        location = /50x.html {
+            root /usr/share/nginx/html;
+        }
+
+        location ~* /watch/([^/]+)/([^/]+)/ws {
+            set $archive $1;
+            set $crawl $2;
+            #auth_request /authcheck;
+            proxy_pass http://$crawl.crawlers.svc.cluster.local:9037/ws;
+            proxy_set_header Host "localhost";
+            proxy_http_version 1.1;
+            proxy_set_header Upgrade $http_upgrade;
+            proxy_set_header Connection $http_connection;
+        }
+
+        location ~* /watch/([^/]+)/([^/]+)/ {
+            set $archive $1;
+            set $crawl $2;
+            #auth_request /authcheck;
+            proxy_pass http://$crawl.crawlers.svc.cluster.local:9037/;
+            proxy_set_header Host "localhost";
+        }
+
+        location = /authcheck {
+            internal;
+            proxy_pass http://localhost:8000/archives/$archive/crawls/$crawl;
+            proxy_pass_request_body off;
+            proxy_set_header Content-Length "";
+        }
+
+        location / {
+            proxy_pass http://localhost:8000/;
+            proxy_set_header Host $host;
+        }
+    }
+}
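
Putting the pieces together: a watcher connects through nginx, which maps /watch/{archive}/{crawl}/ws to the per-crawl Service on port 9037 (via the saved $crawl capture, so later regex matches can't clobber $2). A rough client sketch using the third-party websockets package; hostname and ids are placeholders:

import asyncio
import websockets

async def watch(aid: str, crawl_id: str) -> None:
    # URL shape comes straight from the /watch endpoint and the location block above
    url = f"wss://btrix.example.com/watch/{aid}/{crawl_id}/ws"
    async with websockets.connect(url) as ws:
        async for msg in ws:
            print(f"received {len(msg)} bytes")  # screencast frames from port 9037

asyncio.run(watch("<archive-id>", "<crawl-id>"))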


@@ -0,0 +1,40 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ .Values.name }}-env-config
+  namespace: {{ .Release.Namespace }}
+
+data:
+  MONGO_HOST: {{ .Values.mongo_host }}
+
+  CRAWLER_NAMESPACE: {{ .Values.crawler_namespace }}
+  CRAWLER_IMAGE: {{ .Values.crawler_image }}
+
+  CRAWL_TIMEOUT: "{{ .Values.crawl_timeout }}"
+  CRAWL_RETRIES: "{{ .Values.crawl_retries }}"
+
+  REDIS_URL: "{{ .Values.redis_url }}"
+
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: shared-crawler-config
+  namespace: {{ .Values.crawler_namespace }}
+
+data:
+  CRAWL_ARGS: "{{ .Values.crawler_args }} --redisStoreUrl {{ .Values.redis_url }}"
+  WEBHOOK_URL: "http://browsertrix-cloud.default/_crawls/done"
+  STORE_USER: ""
+
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: nginx-config
+  namespace: {{ .Release.Namespace }}
+
+data:
+{{ (.Files.Glob "*.conf").AsConfig | indent 2 }}


@@ -1,43 +1,50 @@
-# Ingress - only add ingress if 'host' is defined
-{{ if .Values.ingress.host }}
+{{- if .Values.ingress.host }}
 ---
-apiVersion: networking.k8s.io/v1beta1
+apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
-  name: ingress-{{ .Values.name }}
+  name: ingress-main
   namespace: {{ .Release.Namespace }}
   annotations:
     kubernetes.io/ingress.class: "nginx"
-    #nginx.ingress.kubernetes.io/rewrite-target: /$1
     nginx.ingress.kubernetes.io/enable-cors: "true"
     nginx.ingress.kubernetes.io/cors-allow-origin: "*"
-    cert-manager.io/cluster-issuer: "cert-{{ .Values.name }}"
+    {{- if .Values.ingress.tls }}
+    cert-manager.io/cluster-issuer: "cert-main"
+    {{- end }}
     nginx.ingress.kubernetes.io/upstream-vhost: "{{ .Values.ingress.host }}"
     nginx.ingress.kubernetes.io/configuration-snippet: |
-      proxy_set_header X-Forwarded-Proto {{ .Values.scheme | default "https" }};
+      proxy_set_header X-Forwarded-Proto {{ .Values.ingress.scheme | default "https" }};

 spec:
+  {{- if .Values.ingress.tls }}
   tls:
     - hosts:
         - {{ .Values.ingress.host }}
-      secretName: cert-{{ .Values.name }}
+      secretName: cert-main
+  {{- end }}

   rules:
   - host: {{ .Values.ingress.host }}
     http:
       paths:
       - path: /
+        pathType: Prefix
         backend:
-          serviceName: {{ .Values.name }}
-          servicePort: 80
+          service:
+            name: browsertrix-cloud
+            port:
+              number: 80

+{{ if .Values.ingress.tls }}
 ---
 apiVersion: cert-manager.io/v1alpha2
 kind: ClusterIssuer
 metadata:
-  name: cert-{{ .Values.name }}
+  name: cert-main
   namespace: cert-manager
 spec:
   acme:

@@ -47,7 +54,7 @@ spec:
     email: {{ .Values.ingress.cert_email }}
     # Name of a secret used to store the ACME account private key
     privateKeySecretRef:
-      name: cert-{{ .Values.name }}
+      name: cert-main
     # Enable the HTTP-01 challenge provider
     solvers:
     - http01:

@@ -55,3 +62,4 @@
         class: nginx
 {{ end }}
+{{ end }}


@@ -20,6 +20,16 @@ spec:
         "helm.update": {{ randAlphaNum 5 | quote }}
     spec:
+      volumes:
+        - name: nginx-config
+          configMap:
+            name: nginx-config
+            items:
+              - key: nginx.conf
+                path: nginx.conf
+
+        - name: nginx-resolver
+          emptyDir: {}
+
       {{- if .Values.minio_local }}
       initContainers:
@@ -37,7 +47,41 @@ spec:
         args: ['-c', 'mc mb local/test-bucket; mc policy set public local/test-bucket' ]
       {{- end }}

+        - name: init-nginx
+          image: {{ .Values.nginx_image }}
+          command: ["/bin/sh"]
+          args: ["-c", "echo resolver $(awk 'BEGIN{ORS=\" \"} $1==\"nameserver\" {print $2}' /etc/resolv.conf) valid=30s \";\" > /etc/nginx/resolvers/resolvers.conf"]
+          volumeMounts:
+            - name: nginx-resolver
+              mountPath: /etc/nginx/resolvers/
+
       containers:
+        - name: nginx
+          image: {{ .Values.nginx_image }}
+          imagePullPolicy: {{ .Values.nginx_pull_policy }}
+
+          volumeMounts:
+            - name: nginx-config
+              mountPath: /etc/nginx/nginx.conf
+              subPath: nginx.conf
+              readOnly: true
+
+            - name: nginx-resolver
+              mountPath: /etc/nginx/resolvers/
+              readOnly: true
+
+          resources:
+            limits:
+              cpu: {{ .Values.nginx_limit_cpu }}
+
+            requests:
+              cpu: {{ .Values.nginx_requests_cpu }}
+
+          readinessProbe:
+            httpGet:
+              path: /docs
+              port: 80
+
         - name: api
           image: {{ .Values.api_image }}
           imagePullPolicy: {{ .Values.api_pull_policy }}
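
The init-nginx one-liner turns the pod's /etc/resolv.conf nameservers into an nginx resolver directive, which the variable-based proxy_pass in nginx.conf needs in order to look up the per-crawl Service names via cluster DNS. The same transform in Python, purely for readability:

def resolvers_conf(resolv_conf: str) -> str:
    """Build the nginx `resolver` directive from resolv.conf contents."""
    servers = [
        line.split()[1]
        for line in resolv_conf.splitlines()
        if line.startswith("nameserver") and len(line.split()) > 1
    ]
    return f"resolver {' '.join(servers)} valid=30s;"

print(resolvers_conf("nameserver 10.96.0.10\n"))  # -> resolver 10.96.0.10 valid=30s;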
@@ -56,7 +100,10 @@ spec:
             cpu: {{ .Values.api_requests_cpu }}
             memory: {{ .Values.api_requests_memory }}

+          readinessProbe:
+            httpGet:
+              path: /docs
+              port: 8000
+
 ---
 apiVersion: v1
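
Both probes piggyback on FastAPI's built-in /docs page as a cheap readiness URL: nginx is probed through port 80 (exercising the proxy path to the api), the api directly on 8000. Conceptually the probe amounts to (a sketch, not the kubelet's actual implementation):

import urllib.request

def ready(url: str = "http://localhost:8000/docs") -> bool:
    """Return True if the FastAPI docs page answers 200."""
    try:
        with urllib.request.urlopen(url, timeout=2) as resp:
            return resp.status == 200
    except OSError:
        return False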
@@ -89,7 +136,7 @@ spec:
   ports:
     - protocol: TCP
-      port: 8000
+      port: 80
       name: api

   #externalIPs:
   #  - 127.0.0.1


@@ -0,0 +1,59 @@
+{{- if .Values.redis_local }}
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: local-redis
+  namespace: {{ .Release.Namespace }}
+spec:
+  selector:
+    matchLabels:
+      app: local-redis
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        app: local-redis
+    spec:
+      volumes:
+        - name: data-storage
+          hostPath:
+            path: /tmp/browsertrix-redis-data
+            type: DirectoryOrCreate
+
+      containers:
+        - name: redis
+          image: {{ .Values.redis_image }}
+          imagePullPolicy: {{ .Values.redis_pull_policy }}
+
+          args: ["--appendonly", "yes"]
+
+          volumeMounts:
+            - name: data-storage
+              mountPath: /data
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: {{ .Release.Namespace }}
+  name: local-redis
+  labels:
+    app: local-redis
+
+spec:
+  type: ClusterIP
+  selector:
+    app: local-redis
+
+  ports:
+    - protocol: TCP
+      port: 6379
+      name: redis
+
+{{- end }}


@@ -0,0 +1,36 @@
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: auth-secrets
+  namespace: {{ .Release.Namespace }}
+type: Opaque
+stringData:
+  PASSWORD_SECRET: "{{ .Values.api_password_secret }}"
+
+{{- if .Values.minio_local }}
+{{- with (first .Values.storages) }}
+  MINIO_ROOT_USER: "{{ .access_key }}"
+  MINIO_ROOT_PASSWORD: "{{ .secret_key }}"
+  MC_HOST: "{{ $.Values.minio_scheme }}://{{ .access_key }}:{{ .secret_key }}@{{ $.Values.minio_host }}"
+{{- end }}
+{{- end }}
+
+{{- range $storage := .Values.storages }}
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: storage-{{ $storage.name }}
+  namespace: {{ $.Values.crawler_namespace }}
+type: Opaque
+stringData:
+  STORE_ACCESS_KEY: "{{ $storage.access_key }}"
+  STORE_SECRET_KEY: "{{ $storage.secret_key }}"
+  STORE_ENDPOINT_URL: "{{ $storage.endpoint_url }}"
+{{- end }}


@@ -16,6 +16,15 @@ api_requests_memory: "100M"
 api_limits_memory: "256M"

+# Nginx Image
+# =========================================
+nginx_image: "nginx"
+nginx_pull_policy: "IfNotPresent"
+
+nginx_limit_cpu: "100m"
+nginx_requests_cpu: "25m"
+
 # MongoDB Image
 # =========================================
 mongo_local: true

@@ -56,7 +65,7 @@ crawler_namespace: "crawlers"
 crawl_retries: 1

 # browsertrix-crawler args:
-crawler_args: "--timeout 90 --logging stats,behaviors,debug --generateWACZ"
+crawler_args: "--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037 --headless"

@@ -90,8 +99,11 @@ minio_pull_policy: "IfNotPresent"
 # Ingress (Optional)
 # Optional: if 'host' is set, a publicly accessible Ingress controller is created with an SSL cert (using letsencrypt)
 ingress:
-  host: ""
-  cert_email: "ikreymer@gmail.com"
+  host: "btrix.cloud"
+  cert_email: "test@example.com"
+  scheme: "http"
+  tls: false


@@ -11,7 +11,7 @@ services:
       - /var/run/docker.sock:/var/run/docker.sock

     env_file:
-      - ./config.env
+      - ./configs/config.env

     depends_on:
       - minio

@@ -33,7 +33,7 @@ services:
       - btrix-mongo-data:/data/db

     env_file:
-      - ./config.env
+      - ./configs/config.env

   minio:
     image: minio/minio

@@ -46,7 +46,7 @@ services:
       - btrix-minio-data:/data

     env_file:
-      - ./config.env
+      - ./configs/config.env

   init_minio_bucket:
     image: minio/mc

@@ -54,7 +54,7 @@ services:
     command: ['-c', 'mc mb local/test-bucket; mc policy set public local/test-bucket' ]

     env_file:
-      - ./config.env
+      - ./configs/config.env

     depends_on:
       - minio