From 804f755787dca95df588923c8545fa224a180e1b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 28 Feb 2024 14:22:33 -0800 Subject: [PATCH] Increase startup probe time to account for long-running migrations (#1560) - increases the failureThreshold for startupProbe for the api backend container to account for long-running migrations, up to 300 seconds - adds `/healthzStartup` which checks if db is ready - bump - keeps `/healthz` to always return 200 when running - increases livenessProbe failureThreshold to be higher than readiness probe, following recommended best practice of liveness probe > readiness probe - fixes #1559 --- backend/btrixcloud/main.py | 12 ++++++++++-- chart/templates/backend.yaml | 11 +++++------ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index 752a0446..4afd8afe 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -189,12 +189,20 @@ def main(): async def openapi() -> JSONResponse: return JSONResponse(app_root.openapi()) - @app_root.get("/healthz", include_in_schema=False) - async def healthz(): + # Used for startup + # Returns 200 only when db is available + migrations are done + @app_root.get("/healthzStartup", include_in_schema=False) + async def healthz_startup(): if not db_inited.get("inited"): raise HTTPException(status_code=503, detail="not_ready_yet") return {} + # Used for readiness + liveness + # Always returns 200 while running + @app_root.get("/healthz", include_in_schema=False) + async def healthz(): + return {} + app_root.include_router(app, prefix=API_PREFIX) diff --git a/chart/templates/backend.yaml b/chart/templates/backend.yaml index 1c8ce665..dd148d63 100644 --- a/chart/templates/backend.yaml +++ b/chart/templates/backend.yaml @@ -97,11 +97,10 @@ spec: startupProbe: httpGet: - path: /healthz + path: /healthzStartup port: 8000 - initialDelaySeconds: 5 periodSeconds: 5 - failureThreshold: 30 + failureThreshold: 60 
successThreshold: 1 readinessProbe: @@ -119,7 +118,7 @@ spec: port: 8000 initialDelaySeconds: 5 periodSeconds: 30 - failureThreshold: 5 + failureThreshold: 15 successThreshold: 1 - name: op @@ -176,7 +175,7 @@ spec: port: {{ .Values.opPort }} initialDelaySeconds: 5 periodSeconds: 5 - failureThreshold: 30 + failureThreshold: 5 successThreshold: 1 readinessProbe: @@ -194,7 +193,7 @@ spec: port: {{ .Values.opPort }} initialDelaySeconds: 5 periodSeconds: 30 - failureThreshold: 5 + failureThreshold: 15 successThreshold: 1