From c55e6ef86dae95742da971d1a0412ebd6493e11d Mon Sep 17 00:00:00 2001
From: Neverdecel <f.dewit@live.nl>
Date: Tue, 16 Jun 2026 21:02:40 +0200
Subject: [PATCH] feat(chart): configurable UI rollout strategy + 409-tolerant
 reindex

The UI Deployment hardcoded strategy: Recreate. With a single replica that
is correct for the single-writer ReadWriteOnce index, but it means every image
change tears the old pod down before the new one is Ready, leaving the ingress
with no backend for the pull+boot window (surfaces as a 502 / "no available
server"). On a deployment that auto-updates on each beta image, the public UI
flaps on every build.

- Make ui.strategy configurable (default unchanged: Recreate). Operators whose
  volume tolerates same-node multi-attach (e.g. k3s local-path) or is
  ReadWriteMany (RWX), AND whose UI is read-only during the overlap (demo mode),
  can opt into a surge-first RollingUpdate (maxUnavailable: 0, maxSurge: 1) for
  seamless rollouts. Worst case on a misjudged volume is a stalled-but-up
  rollout, never an outage.
- reindex CronJob: treat HTTP 409 (a build already in progress) as a benign
  no-op instead of a hard curl failure, so a periodic refresh overlapping the
  per-upgrade init Job no longer fails the Job and piles up Error pods via
  backoff retries. Any other non-2xx status still fails the Job.
- Bump chart 0.1.1 -> 0.1.2.
---
 deploy/helm/coderag/Chart.yaml                |  2 +-
 .../coderag/templates/reindex-cronjob.yaml    | 19 +++++++++++++++---
 .../helm/coderag/templates/ui-deployment.yaml |  2 +-
 deploy/helm/coderag/values.yaml               | 20 +++++++++++++++++++
 4 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/deploy/helm/coderag/Chart.yaml b/deploy/helm/coderag/Chart.yaml
index 012747a..5a668bb 100644
--- a/deploy/helm/coderag/Chart.yaml
+++ b/deploy/helm/coderag/Chart.yaml
@@ -7,7 +7,7 @@ description: >-
 type: application
 
 # Chart version — bump on every chart change (independent of the app version).
-version: 0.1.1
+version: 0.1.2
 
 # Version of CodeRAG this chart deploys by default. No versioned container images
 # are published yet, so the default image tag is the rolling `:beta` channel; pin
diff --git a/deploy/helm/coderag/templates/reindex-cronjob.yaml b/deploy/helm/coderag/templates/reindex-cronjob.yaml
index e58727b..5169149 100644
--- a/deploy/helm/coderag/templates/reindex-cronjob.yaml
+++ b/deploy/helm/coderag/templates/reindex-cronjob.yaml
@@ -59,11 +59,24 @@ spec:
                     AUTH="Authorization: Bearer ${CODERAG_API_KEY}"
                   fi
                   echo "Reindex (full=$FULL) via $CODERAG_URL ..."
-                  curl -fsS -X POST ${AUTH:+-H "$AUTH"} "$CODERAG_URL/index" \
+                  # -o sends the response body to stderr and -w prints only the HTTP status,
+                  # so $code captures the status and we can act on it instead of just exiting.
+                  code=$(curl -sS -o /dev/stderr -w '%{http_code}' \
+                    -X POST ${AUTH:+-H "$AUTH"} "$CODERAG_URL/index" \
                     -H 'content-type: application/json' \
-                    -d "{\"full\": $FULL}"
+                    -d "{\"full\": $FULL}")
                   echo
-                  echo "Reindex request complete."
+                  # 409 = an index build is already running (e.g. the per-upgrade init
+                  # Job, or an overlapping run). That is a benign no-op for a periodic
+                  # refresh, not a failure — don't fail the Job and trigger backoff retries.
+                  if [ "$code" = "409" ]; then
+                    echo "Reindex skipped: a build is already in progress (HTTP 409)."
+                    exit 0
+                  fi
+                  case "$code" in
+                    2*) echo "Reindex request complete (HTTP $code)." ;;
+                    *)  echo "Reindex failed (HTTP $code)." >&2; exit 1 ;;
+                  esac
               resources:
                 {{- toYaml .Values.index.resources | nindent 16 }}
               volumeMounts:
diff --git a/deploy/helm/coderag/templates/ui-deployment.yaml b/deploy/helm/coderag/templates/ui-deployment.yaml
index 14f9ad7..6f59184 100644
--- a/deploy/helm/coderag/templates/ui-deployment.yaml
+++ b/deploy/helm/coderag/templates/ui-deployment.yaml
@@ -10,7 +10,7 @@ spec:
   # The UI bundles the engine and writes its own index — single writer, one replica.
   replicas: 1
   strategy:
-    type: Recreate
+    {{- toYaml .Values.ui.strategy | nindent 4 }}
   selector:
     matchLabels:
       {{- include "coderag.selectorLabels" . | nindent 6 }}
diff --git a/deploy/helm/coderag/values.yaml b/deploy/helm/coderag/values.yaml
index 1e6f5ed..906a020 100644
--- a/deploy/helm/coderag/values.yaml
+++ b/deploy/helm/coderag/values.yaml
@@ -180,6 +180,26 @@ server:
 ui:
   enabled: false
   containerPort: 8501
+  # -- Deployment update strategy. Defaults to Recreate: the UI is a single writer on
+  # a ReadWriteOnce volume, so the old pod must release the claim before the new one
+  # binds it. The cost is a brief gap with no Ready pod on every image change — visible
+  # behind an ingress as a 502 / "no available server".
+  #
+  # Switch to a RollingUpdate with maxUnavailable: 0 / maxSurge: 1 to make image rollouts
+  # seamless (new pod goes Ready before the old one is removed) ONLY when BOTH hold:
+  #   1. the volume tolerates two pods mounting it at once — same-node RWO (e.g. k3s
+  #      local-path, where the surge pod lands on the same node) or a ReadWriteMany
+  #      class — otherwise the surge pod is stuck Pending and the rollout stalls; and
+  #   2. there are no concurrent index writes during the overlap (the UI only writes on
+  #      Reindex, so a read-only / demo-mode UI is safe; a UI actively reindexing is not).
+  # Worst case if (1) is misjudged is a stalled-but-still-up rollout, never an outage.
+  #   strategy:
+  #     type: RollingUpdate
+  #     rollingUpdate:
+  #       maxUnavailable: 0
+  #       maxSurge: 1
+  strategy:
+    type: Recreate
   service:
     type: ClusterIP
     port: 8501