From 7413b4cc1bd381cbfa62a2d0eb4ccc86767b05d5 Mon Sep 17 00:00:00 2001
From: ada mancini <ada@replicated.com>
Date: Wed, 25 Mar 2026 16:22:18 -0400
Subject: [PATCH 1/4] ci(mlflow): speed up k3s CI jobs

- Use 1-node k3s clusters instead of 3-node; GKE keeps 3 nodes
- Remove invalid skip-preflights and debug inputs from kots-install action
- Replace 5-minute service poll loop with kubectl wait --for=condition=Available
---
 .github/workflows/mlflow-ci.yml | 82 +++++++++++----------------------
 1 file changed, 27 insertions(+), 55 deletions(-)

diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml
index 39a54cf6..13717fe6 100644
--- a/.github/workflows/mlflow-ci.yml
+++ b/.github/workflows/mlflow-ci.yml
@@ -166,8 +166,10 @@ jobs:
         cluster:
           - distribution: k3s
             version: 1.32
+            nodes: 1
           - distribution: gke
             version: 1.32
+            nodes: 3
         config:
           - name: nodeport-ingress-disabled
             values_file: tests/helm/nodeport-ingress-disabled.yaml
@@ -229,7 +231,7 @@ jobs:
           kubernetes-version: ${{ matrix.cluster.version }}
           cluster-name: mlflow-ci-${{ github.run_id }}-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }}-${{ matrix.config.name }}
           disk: 100
-          nodes: 3
+          nodes: ${{ matrix.cluster.nodes }}
           ttl: 1h
           export-kubeconfig: true
 
@@ -303,8 +305,10 @@ jobs:
         cluster:
           - distribution: k3s
             version: 1.32
+            nodes: 1
           - distribution: gke
             version: 1.32
+            nodes: 3
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -340,7 +344,7 @@ jobs:
           kubernetes-version: ${{ matrix.cluster.version }}
           cluster-name: mlflow-kots-${{ github.run_id }}-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }}
           disk: 100
-          nodes: 3
+          nodes: ${{ matrix.cluster.nodes }}
           ttl: 1h
           export-kubeconfig: true
 
@@ -407,8 +411,6 @@ jobs:
           namespace: default
           wait-duration: 10m
           shared-password: 'replicatedmlflow'
-          skip-preflights: true
-          debug: true
 
       # Set up port forwarding after KOTS installation is complete
       - name: Set up port forwarding
@@ -416,63 +418,33 @@ jobs:
         run: |
           KUBECONFIG_FILE="/tmp/kubeconfig-kots-test-${{ github.run_id }}"
           echo "$KUBECONFIG" > "$KUBECONFIG_FILE"
-          echo "Saved kubeconfig to $KUBECONFIG_FILE"
           PORT="5000"
-          echo "Using port: $PORT for testing"
-          echo "Waiting for MLflow service to be created..."
-          MAX_RETRIES=30
-          RETRY_INTERVAL=10
-          RETRY_COUNT=0
-          SERVICE_FOUND=false
-          while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
-            echo "Check $((RETRY_COUNT+1))/$MAX_RETRIES: Looking for MLflow service..."
-            if KUBECONFIG="$KUBECONFIG_FILE" kubectl get svc mlflow -n default --no-headers 2>/dev/null; then
-              echo "MLflow service found!"
-              SERVICE_FOUND=true
-              break
-            else
-              echo "MLflow service not found yet. Waiting $RETRY_INTERVAL seconds..."
-              RETRY_COUNT=$((RETRY_COUNT+1))
-              sleep $RETRY_INTERVAL
-            fi
-          done
-          if [ "$SERVICE_FOUND" != "true" ]; then
-            echo "ERROR: MLflow service not found after $((MAX_RETRIES * RETRY_INTERVAL)) seconds."
-            echo "Showing all available services in the namespace:"
-            KUBECONFIG="$KUBECONFIG_FILE" kubectl get svc -n default
-            echo "Showing KOTS application status:"
-            KUBECONFIG="$KUBECONFIG_FILE" kubectl get app -n default || true
-            echo "Showing all pods in the namespace:"
+
+          echo "Waiting for MLflow deployment to be available..."
+          KUBECONFIG="$KUBECONFIG_FILE" kubectl wait deployment \
+            --selector=app.kubernetes.io/name=mlflow \
+            --for=condition=Available \
+            --timeout=5m \
+            -n default || {
+            echo "ERROR: MLflow deployment not available after 5m"
             KUBECONFIG="$KUBECONFIG_FILE" kubectl get pods -n default
+            KUBECONFIG="$KUBECONFIG_FILE" kubectl describe deployment -l app.kubernetes.io/name=mlflow -n default
             exit 1
-          fi
-          KUBECONFIG="$KUBECONFIG_FILE" kubectl get svc -n default
-          echo "Checking pod status..."
-          KUBECONFIG="$KUBECONFIG_FILE" kubectl get pods -n default
-          echo "Waiting for MLflow pods to be running..."
-          KUBECONFIG="$KUBECONFIG_FILE" kubectl wait --for=condition=Ready pods --selector=app.kubernetes.io/name=mlflow -n default --timeout=2m || {
-            echo "WARNING: Timed out waiting for pods to be ready, will try port-forwarding anyway"
-            KUBECONFIG="$KUBECONFIG_FILE" kubectl describe pods -n default
           }
-          echo "Setting up port forwarding to run in the background"
+
+          echo "Setting up port forwarding..."
           nohup bash -c "KUBECONFIG='$KUBECONFIG_FILE' kubectl port-forward -n default svc/mlflow $PORT:5000 &>/tmp/port-forward-kots-${{ github.run_id }}.log" &
-          PORT_FORWARD_PID=$!
-          echo "port_forward_pid=$PORT_FORWARD_PID" >> $GITHUB_OUTPUT
-          echo "Set up port forwarding with PID: $PORT_FORWARD_PID"
+          echo "port_forward_pid=$!" >> $GITHUB_OUTPUT
           echo "hostname=localhost:$PORT" >> $GITHUB_OUTPUT
-          echo "Test endpoint will be: localhost:$PORT"
-          echo "Waiting for port-forward to establish..."
-          sleep 15
-          echo "Checking connectivity to MLflow..."
-          if curl -s -o /dev/null -w "%{http_code}" http://localhost:$PORT/; then
-            echo "Successfully connected to MLflow service!"
-          else
-            echo "Warning: Initial connection attempt failed, service may still be starting"
-            echo "Port-forward log:"
-            cat /tmp/port-forward-kots-${{ github.run_id }}.log || true
-            echo "Pod logs:"
-            KUBECONFIG="$KUBECONFIG_FILE" kubectl logs -n default -l app.kubernetes.io/name=mlflow --tail=20 || true
-          fi
+
+          echo "Waiting for port-forward to become ready..."
+          for i in $(seq 1 10); do
+            if curl -sf http://localhost:$PORT/health >/dev/null 2>&1; then
+              echo "MLflow is reachable"
+              break
+            fi
+            sleep 3
+          done
         env:
           KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }}
 

From 50f5df7ae8a189e2521c9c524a3bc93236e0b4e3 Mon Sep 17 00:00:00 2001
From: ada mancini <ada@replicated.com>
Date: Wed, 25 Mar 2026 16:45:46 -0400
Subject: [PATCH 2/4] ci(mlflow): apply Copilot review suggestions

- Poll for deployment existence before kubectl wait (avoids immediate
  failure if KOTS hasn't created the Deployment yet)
- Add || true to kubectl diagnostics in error handler (prevents bash -e
  from aborting the handler at the first failing diagnostic, before the
  remaining output and the explicit exit 1)
- Fail port-forward step explicitly if MLflow never becomes reachable,
  with pod logs and port-forward log printed before exiting non-zero
---
 .github/workflows/mlflow-ci.yml | 36 +++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml
index 13717fe6..5824a2a8 100644
--- a/.github/workflows/mlflow-ci.yml
+++ b/.github/workflows/mlflow-ci.yml
@@ -420,6 +420,22 @@ jobs:
           echo "$KUBECONFIG" > "$KUBECONFIG_FILE"
           PORT="5000"
 
+          echo "Waiting for MLflow deployment to be created..."
+          for i in $(seq 1 30); do
+            if KUBECONFIG="$KUBECONFIG_FILE" kubectl get deployment -l app.kubernetes.io/name=mlflow -n default -o name 2>/dev/null | grep -q .; then
+              echo "MLflow deployment found."
+              break
+            fi
+            echo "MLflow deployment not found yet (attempt $i/30), waiting..."
+            sleep 10
+          done
+
+          if ! KUBECONFIG="$KUBECONFIG_FILE" kubectl get deployment -l app.kubernetes.io/name=mlflow -n default -o name 2>/dev/null | grep -q .; then
+            echo "ERROR: MLflow deployment was not created within the expected time"
+            KUBECONFIG="$KUBECONFIG_FILE" kubectl get deployments,pods -n default || true
+            exit 1
+          fi
+
           echo "Waiting for MLflow deployment to be available..."
           KUBECONFIG="$KUBECONFIG_FILE" kubectl wait deployment \
             --selector=app.kubernetes.io/name=mlflow \
@@ -427,8 +443,8 @@ jobs:
             --timeout=5m \
             -n default || {
             echo "ERROR: MLflow deployment not available after 5m"
-            KUBECONFIG="$KUBECONFIG_FILE" kubectl get pods -n default
-            KUBECONFIG="$KUBECONFIG_FILE" kubectl describe deployment -l app.kubernetes.io/name=mlflow -n default
+            KUBECONFIG="$KUBECONFIG_FILE" kubectl get pods -n default || true
+            KUBECONFIG="$KUBECONFIG_FILE" kubectl describe deployment -l app.kubernetes.io/name=mlflow -n default || true
             exit 1
           }
 
@@ -438,13 +454,29 @@ jobs:
           echo "hostname=localhost:$PORT" >> $GITHUB_OUTPUT
 
           echo "Waiting for port-forward to become ready..."
+          success=false
           for i in $(seq 1 10); do
             if curl -sf http://localhost:$PORT/health >/dev/null 2>&1; then
               echo "MLflow is reachable"
+              success=true
               break
             fi
             sleep 3
           done
+
+          if [ "$success" != "true" ]; then
+            echo "ERROR: MLflow never became reachable via port-forward after 30s"
+            echo "==== Port-forward log ===="
+            cat "/tmp/port-forward-kots-${{ github.run_id }}.log" || true
+            echo "==== MLflow pods ===="
+            KUBECONFIG="$KUBECONFIG_FILE" kubectl get pods -l app.kubernetes.io/name=mlflow -n default || true
+            pod_name="$(KUBECONFIG="$KUBECONFIG_FILE" kubectl get pods -l app.kubernetes.io/name=mlflow -n default -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")"
+            if [ -n "$pod_name" ]; then
+              echo "==== Logs for pod $pod_name ===="
+              KUBECONFIG="$KUBECONFIG_FILE" kubectl logs "$pod_name" -n default || true
+            fi
+            exit 1
+          fi
         env:
           KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }}
 

From 252afeb2383491d0c0320ddf8326948d10490d27 Mon Sep 17 00:00:00 2001
From: ada mancini <ada@replicated.com>
Date: Thu, 26 Mar 2026 17:49:55 -0400
Subject: [PATCH 3/4] ci(mlflow): fix KOTS port-forward race and reduce
 postgres instances

kubectl wait --for=condition=Available returns immediately for a
Deployment with maxUnavailable=1 and replicas=1 because 0 available
pods already satisfy the condition (available >= replicas -
maxUnavailable, i.e. 0 >= 1-1). As a result the pod was still
Init:0/2 when port-forward ran.

Replace with kubectl rollout status which correctly waits until all
desired replicas are Running and Ready, with a 10m timeout to cover
the CNPG bootstrap + pip init containers.

Also set postgres.embedded.instances=1 in the CI test values; the
default 3-instance cluster is unnecessary for smoke testing and adds
significant startup latency on single-node k3s clusters.
---
 .github/workflows/mlflow-ci.yml                      | 12 +++++-------
 .../mlflow/tests/helm/nodeport-ingress-disabled.yaml |  5 +++++
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml
index 5824a2a8..611856d7 100644
--- a/.github/workflows/mlflow-ci.yml
+++ b/.github/workflows/mlflow-ci.yml
@@ -436,15 +436,13 @@ jobs:
             exit 1
           fi
 
-          echo "Waiting for MLflow deployment to be available..."
-          KUBECONFIG="$KUBECONFIG_FILE" kubectl wait deployment \
-            --selector=app.kubernetes.io/name=mlflow \
-            --for=condition=Available \
-            --timeout=5m \
+          echo "Waiting for MLflow rollout to complete..."
+          KUBECONFIG="$KUBECONFIG_FILE" kubectl rollout status deployment/mlflow \
+            --timeout=10m \
             -n default || {
-            echo "ERROR: MLflow deployment not available after 5m"
+            echo "ERROR: MLflow deployment did not complete rollout within 10m"
             KUBECONFIG="$KUBECONFIG_FILE" kubectl get pods -n default || true
-            KUBECONFIG="$KUBECONFIG_FILE" kubectl describe deployment -l app.kubernetes.io/name=mlflow -n default || true
+            KUBECONFIG="$KUBECONFIG_FILE" kubectl describe deployment/mlflow -n default || true
             exit 1
           }
 
diff --git a/applications/mlflow/tests/helm/nodeport-ingress-disabled.yaml b/applications/mlflow/tests/helm/nodeport-ingress-disabled.yaml
index c045bb8d..45ead8e4 100644
--- a/applications/mlflow/tests/helm/nodeport-ingress-disabled.yaml
+++ b/applications/mlflow/tests/helm/nodeport-ingress-disabled.yaml
@@ -18,3 +18,8 @@ mlflow:
   # Environment configuration
   # env:
   #   container: []
+
+postgres:
+  embedded:
+    # Single instance is sufficient for CI; reduces startup time on 1-node clusters
+    instances: 1

From f09c59a2011ec43884022e8c0f3f4c3fad275dab Mon Sep 17 00:00:00 2001
From: ada mancini <ada@replicated.com>
Date: Fri, 27 Mar 2026 11:15:06 -0400
Subject: [PATCH 4/4] ci(mlflow): increase port-forward readiness poll to cover
 DB migrations

MLflow runs alembic DB migrations on first start before binding to
port 5000. With no readiness probe, kubectl rollout status completes
as soon as the container process starts, not when the server is
actually listening. The 30s curl loop (10x3s) was too short to cover
the full migration sequence on GKE.

Increase to 10s initial wait + 30 attempts x 5s = up to 160s total,
which comfortably covers the migration runtime. Update the failure
message to report 160s instead of the stale 30s.
---
 .github/workflows/mlflow-ci.yml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml
index 611856d7..993e7189 100644
--- a/.github/workflows/mlflow-ci.yml
+++ b/.github/workflows/mlflow-ci.yml
@@ -452,18 +452,20 @@ jobs:
           echo "hostname=localhost:$PORT" >> $GITHUB_OUTPUT
 
           echo "Waiting for port-forward to become ready..."
+          # Head start: MLflow runs DB migrations before it starts listening.
+          sleep 10
           success=false
-          for i in $(seq 1 10); do
+          for i in $(seq 1 30); do
             if curl -sf http://localhost:$PORT/health >/dev/null 2>&1; then
               echo "MLflow is reachable"
               success=true
               break
             fi
-            sleep 3
+            sleep 5
           done
 
           if [ "$success" != "true" ]; then
-            echo "ERROR: MLflow never became reachable via port-forward after 30s"
+            echo "ERROR: MLflow never became reachable via port-forward after 160s"
             echo "==== Port-forward log ===="
             cat "/tmp/port-forward-kots-${{ github.run_id }}.log" || true
             echo "==== MLflow pods ===="