From c55e6ef86dae95742da971d1a0412ebd6493e11d Mon Sep 17 00:00:00 2001 From: Neverdecel Date: Tue, 16 Jun 2026 21:02:40 +0200 Subject: [PATCH] feat(chart): configurable UI rollout strategy + 409-tolerant reindex The UI Deployment hardcoded strategy: Recreate. With a single replica that is correct for the single-writer ReadWriteOnce index, but it means every image change tears the old pod down before the new one is Ready, leaving the ingress with no backend for the pull+boot window (surfaces as a 502 / "no available server"). On a deployment that auto-updates on each beta image, the public UI flaps on every build. - Make ui.strategy configurable (default unchanged: Recreate). Operators whose volume tolerates same-node multi-attach (k3s local-path, RWX) AND whose UI is read-only during the overlap (demo mode) can opt into a surge-first RollingUpdate (maxSurge: 1, maxUnavailable: 0) for seamless rollouts. Worst case on a misjudged volume is a stalled-but-up rollout, never an outage. - reindex CronJob: treat HTTP 409 (a build already in progress) as a benign no-op instead of a hard curl failure, so a periodic refresh overlapping the per-upgrade init Job no longer fails the Job and piles up Error pods via backoff retries. Any other non-2xx still fails. - Bump chart 0.1.1 -> 0.1.2. --- deploy/helm/coderag/Chart.yaml | 2 +- .../coderag/templates/reindex-cronjob.yaml | 19 +++++++++++++++--- .../helm/coderag/templates/ui-deployment.yaml | 2 +- deploy/helm/coderag/values.yaml | 20 +++++++++++++++++++ 4 files changed, 38 insertions(+), 5 deletions(-) diff --git a/deploy/helm/coderag/Chart.yaml b/deploy/helm/coderag/Chart.yaml index 012747a..5a668bb 100644 --- a/deploy/helm/coderag/Chart.yaml +++ b/deploy/helm/coderag/Chart.yaml @@ -7,7 +7,7 @@ description: >- type: application # Chart version — bump on every chart change (independent of the app version). -version: 0.1.1 +version: 0.1.2 # Version of CodeRAG this chart deploys by default.
No versioned container images # are published yet, so the default image tag is the rolling `:beta` channel; pin diff --git a/deploy/helm/coderag/templates/reindex-cronjob.yaml b/deploy/helm/coderag/templates/reindex-cronjob.yaml index e58727b..5169149 100644 --- a/deploy/helm/coderag/templates/reindex-cronjob.yaml +++ b/deploy/helm/coderag/templates/reindex-cronjob.yaml @@ -59,11 +59,24 @@ spec: AUTH="Authorization: Bearer ${CODERAG_API_KEY}" fi echo "Reindex (full=$FULL) via $CODERAG_URL ..." - curl -fsS -X POST ${AUTH:+-H "$AUTH"} "$CODERAG_URL/index" \ + # -w captures the HTTP status into $code and -o sends the response body to + # stderr; -f is dropped so a 4xx/5xx no longer aborts curl and we can act on the code. + code=$(curl -sS -o /dev/stderr -w '%{http_code}' \ + -X POST ${AUTH:+-H "$AUTH"} "$CODERAG_URL/index" \ -H 'content-type: application/json' \ - -d "{\"full\": $FULL}" + -d "{\"full\": $FULL}") echo - echo "Reindex request complete." + # 409 = an index build is already running (e.g. the per-upgrade init + # Job, or an overlapping run). That is a benign no-op for a periodic + # refresh, not a failure — don't fail the Job and trigger backoff retries. + if [ "$code" = "409" ]; then + echo "Reindex skipped: a build is already in progress (HTTP 409)." + exit 0 + fi + case "$code" in + 2*) echo "Reindex request complete (HTTP $code)." ;; + *) echo "Reindex failed (HTTP $code)." >&2; exit 1 ;; + esac resources: {{- toYaml .Values.index.resources | nindent 16 }} volumeMounts: diff --git a/deploy/helm/coderag/templates/ui-deployment.yaml b/deploy/helm/coderag/templates/ui-deployment.yaml index 14f9ad7..6f59184 100644 --- a/deploy/helm/coderag/templates/ui-deployment.yaml +++ b/deploy/helm/coderag/templates/ui-deployment.yaml @@ -10,7 +10,7 @@ spec: # The UI bundles the engine and writes its own index — single writer, one replica. replicas: 1 strategy: - type: Recreate + {{- toYaml .Values.ui.strategy | nindent 4 }} selector: matchLabels: {{- include "coderag.selectorLabels" .
| nindent 6 }} diff --git a/deploy/helm/coderag/values.yaml b/deploy/helm/coderag/values.yaml index 1e6f5ed..906a020 100644 --- a/deploy/helm/coderag/values.yaml +++ b/deploy/helm/coderag/values.yaml @@ -180,6 +180,26 @@ server: ui: enabled: false containerPort: 8501 + # -- Deployment update strategy. Defaults to Recreate: the UI is a single writer on + # a ReadWriteOnce volume, so the old pod must release the claim before the new one + # binds it. The cost is a brief gap with no Ready pod on every image change — visible + # behind an ingress as a 502 / "no available server". + # + # Switch to a surge-first RollingUpdate to make image rollouts seamless (new pod goes + # Ready before the old one is removed) ONLY when BOTH hold: + # 1. the volume tolerates two pods mounting it at once — same-node RWO (e.g. k3s + # local-path, where the surge pod lands on the same node) or a ReadWriteMany + # class — otherwise the surge pod is stuck Pending and the rollout stalls; and + # 2. there are no concurrent index writes during the overlap (the UI only writes on + # Reindex, so a read-only / demo-mode UI is safe; a UI actively reindexing is not). + # Worst case if (1) is misjudged is a stalled-but-still-up rollout, never an outage. + # strategy: + # type: RollingUpdate + # rollingUpdate: + # maxUnavailable: 0 + # maxSurge: 1 + strategy: + type: Recreate service: type: ClusterIP port: 8501