From 7413b4cc1bd381cbfa62a2d0eb4ccc86767b05d5 Mon Sep 17 00:00:00 2001 From: ada mancini Date: Wed, 25 Mar 2026 16:22:18 -0400 Subject: [PATCH 1/4] ci(mlflow): speed up k3s CI jobs - Use 1-node k3s clusters instead of 3-node; GKE keeps 3 nodes - Remove invalid skip-preflights and debug inputs from kots-install action - Replace 5-minute service poll loop with kubectl wait --for=condition=Available --- .github/workflows/mlflow-ci.yml | 82 +++++++++++---------------------- 1 file changed, 27 insertions(+), 55 deletions(-) diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index 39a54cf6..13717fe6 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -166,8 +166,10 @@ jobs: cluster: - distribution: k3s version: 1.32 + nodes: 1 - distribution: gke version: 1.32 + nodes: 3 config: - name: nodeport-ingress-disabled values_file: tests/helm/nodeport-ingress-disabled.yaml @@ -229,7 +231,7 @@ jobs: kubernetes-version: ${{ matrix.cluster.version }} cluster-name: mlflow-ci-${{ github.run_id }}-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }}-${{ matrix.config.name }} disk: 100 - nodes: 3 + nodes: ${{ matrix.cluster.nodes }} ttl: 1h export-kubeconfig: true @@ -303,8 +305,10 @@ jobs: cluster: - distribution: k3s version: 1.32 + nodes: 1 - distribution: gke version: 1.32 + nodes: 3 steps: - name: Checkout uses: actions/checkout@v4 @@ -340,7 +344,7 @@ jobs: kubernetes-version: ${{ matrix.cluster.version }} cluster-name: mlflow-kots-${{ github.run_id }}-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }} disk: 100 - nodes: 3 + nodes: ${{ matrix.cluster.nodes }} ttl: 1h export-kubeconfig: true @@ -407,8 +411,6 @@ jobs: namespace: default wait-duration: 10m shared-password: 'replicatedmlflow' - skip-preflights: true - debug: true # Set up port forwarding after KOTS installation is complete - name: Set up port forwarding @@ -416,63 +418,33 @@ jobs: run: | KUBECONFIG_FILE="/tmp/kubeconfig-kots-test-${{ github.run_id }}" echo "$KUBECONFIG" > "$KUBECONFIG_FILE" - echo "Saved kubeconfig to $KUBECONFIG_FILE" PORT="5000" - echo "Using port: $PORT for testing" - echo "Waiting for MLflow service to be created..." - MAX_RETRIES=30 - RETRY_INTERVAL=10 - RETRY_COUNT=0 - SERVICE_FOUND=false - while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do - echo "Check $((RETRY_COUNT+1))/$MAX_RETRIES: Looking for MLflow service..." - if KUBECONFIG="$KUBECONFIG_FILE" kubectl get svc mlflow -n default --no-headers 2>/dev/null; then - echo "MLflow service found!" - SERVICE_FOUND=true - break - else - echo "MLflow service not found yet. Waiting $RETRY_INTERVAL seconds..." - RETRY_COUNT=$((RETRY_COUNT+1)) - sleep $RETRY_INTERVAL - fi - done - if [ "$SERVICE_FOUND" != "true" ]; then - echo "ERROR: MLflow service not found after $((MAX_RETRIES * RETRY_INTERVAL)) seconds." - echo "Showing all available services in the namespace:" - KUBECONFIG="$KUBECONFIG_FILE" kubectl get svc -n default - echo "Showing KOTS application status:" - KUBECONFIG="$KUBECONFIG_FILE" kubectl get app -n default || true - echo "Showing all pods in the namespace:" + + echo "Waiting for MLflow deployment to be available..." + KUBECONFIG="$KUBECONFIG_FILE" kubectl wait deployment \ + --selector=app.kubernetes.io/name=mlflow \ + --for=condition=Available \ + --timeout=5m \ + -n default || { + echo "ERROR: MLflow deployment not available after 5m" KUBECONFIG="$KUBECONFIG_FILE" kubectl get pods -n default + KUBECONFIG="$KUBECONFIG_FILE" kubectl describe deployment -l app.kubernetes.io/name=mlflow -n default exit 1 - fi - KUBECONFIG="$KUBECONFIG_FILE" kubectl get svc -n default - echo "Checking pod status..." - KUBECONFIG="$KUBECONFIG_FILE" kubectl get pods -n default - echo "Waiting for MLflow pods to be running..." - KUBECONFIG="$KUBECONFIG_FILE" kubectl wait --for=condition=Ready pods --selector=app.kubernetes.io/name=mlflow -n default --timeout=2m || { - echo "WARNING: Timed out waiting for pods to be ready, will try port-forwarding anyway" - KUBECONFIG="$KUBECONFIG_FILE" kubectl describe pods -n default } - echo "Setting up port forwarding to run in the background" + + echo "Setting up port forwarding..." nohup bash -c "KUBECONFIG='$KUBECONFIG_FILE' kubectl port-forward -n default svc/mlflow $PORT:5000 &>/tmp/port-forward-kots-${{ github.run_id }}.log" & - PORT_FORWARD_PID=$! - echo "port_forward_pid=$PORT_FORWARD_PID" >> $GITHUB_OUTPUT - echo "Set up port forwarding with PID: $PORT_FORWARD_PID" + echo "port_forward_pid=$!" >> $GITHUB_OUTPUT echo "hostname=localhost:$PORT" >> $GITHUB_OUTPUT - echo "Test endpoint will be: localhost:$PORT" - echo "Waiting for port-forward to establish..." - sleep 15 - echo "Checking connectivity to MLflow..." - if curl -s -o /dev/null -w "%{http_code}" http://localhost:$PORT/; then - echo "Successfully connected to MLflow service!" - else - echo "Warning: Initial connection attempt failed, service may still be starting" - echo "Port-forward log:" - cat /tmp/port-forward-kots-${{ github.run_id }}.log || true - echo "Pod logs:" - KUBECONFIG="$KUBECONFIG_FILE" kubectl logs -n default -l app.kubernetes.io/name=mlflow --tail=20 || true - fi + + echo "Waiting for port-forward to become ready..." + for i in $(seq 1 10); do + if curl -sf http://localhost:$PORT/health >/dev/null 2>&1; then + echo "MLflow is reachable" + break + fi + sleep 3 + done env: KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }} From 50f5df7ae8a189e2521c9c524a3bc93236e0b4e3 Mon Sep 17 00:00:00 2001 From: ada mancini Date: Wed, 25 Mar 2026 16:45:46 -0400 Subject: [PATCH 2/4] ci(mlflow): apply Copilot review suggestions - Poll for deployment existence before kubectl wait (avoids immediate failure if KOTS hasn't created the Deployment yet) - Add || true to kubectl diagnostics in error handler (prevents bash -e from swallowing the original error when describe returns non-zero) - Fail port-forward step explicitly if MLflow never becomes reachable, with pod logs and port-forward log printed before exiting non-zero --- .github/workflows/mlflow-ci.yml | 36 +++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index 13717fe6..5824a2a8 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -420,6 +420,22 @@ jobs: echo "$KUBECONFIG" > "$KUBECONFIG_FILE" PORT="5000" + echo "Waiting for MLflow deployment to be created..." + for i in $(seq 1 30); do + if KUBECONFIG="$KUBECONFIG_FILE" kubectl get deployment -l app.kubernetes.io/name=mlflow -n default -o name 2>/dev/null | grep -q .; then + echo "MLflow deployment found." + break + fi + echo "MLflow deployment not found yet (attempt $i/30), waiting..." + sleep 10 + done + + if ! KUBECONFIG="$KUBECONFIG_FILE" kubectl get deployment -l app.kubernetes.io/name=mlflow -n default -o name 2>/dev/null | grep -q .; then + echo "ERROR: MLflow deployment was not created within the expected time" + KUBECONFIG="$KUBECONFIG_FILE" kubectl get deployments,pods -n default || true + exit 1 + fi + echo "Waiting for MLflow deployment to be available..." KUBECONFIG="$KUBECONFIG_FILE" kubectl wait deployment \ --selector=app.kubernetes.io/name=mlflow \ @@ -427,8 +443,8 @@ jobs: --timeout=5m \ -n default || { echo "ERROR: MLflow deployment not available after 5m" - KUBECONFIG="$KUBECONFIG_FILE" kubectl get pods -n default - KUBECONFIG="$KUBECONFIG_FILE" kubectl describe deployment -l app.kubernetes.io/name=mlflow -n default + KUBECONFIG="$KUBECONFIG_FILE" kubectl get pods -n default || true + KUBECONFIG="$KUBECONFIG_FILE" kubectl describe deployment -l app.kubernetes.io/name=mlflow -n default || true exit 1 } @@ -438,13 +454,29 @@ jobs: echo "hostname=localhost:$PORT" >> $GITHUB_OUTPUT echo "Waiting for port-forward to become ready..." + success=false for i in $(seq 1 10); do if curl -sf http://localhost:$PORT/health >/dev/null 2>&1; then echo "MLflow is reachable" + success=true break fi sleep 3 done + + if [ "$success" != "true" ]; then + echo "ERROR: MLflow never became reachable via port-forward after 30s" + echo "==== Port-forward log ====" + cat "/tmp/port-forward-kots-${{ github.run_id }}.log" || true + echo "==== MLflow pods ====" + KUBECONFIG="$KUBECONFIG_FILE" kubectl get pods -l app.kubernetes.io/name=mlflow -n default || true + pod_name="$(KUBECONFIG="$KUBECONFIG_FILE" kubectl get pods -l app.kubernetes.io/name=mlflow -n default -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")" + if [ -n "$pod_name" ]; then + echo "==== Logs for pod $pod_name ====" + KUBECONFIG="$KUBECONFIG_FILE" kubectl logs "$pod_name" -n default || true + fi + exit 1 + fi env: KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }} From 252afeb2383491d0c0320ddf8326948d10490d27 Mon Sep 17 00:00:00 2001 From: ada mancini Date: Thu, 26 Mar 2026 17:49:55 -0400 Subject: [PATCH 3/4] ci(mlflow): fix KOTS port-forward race and reduce postgres instances kubectl wait --for=condition=Available returns immediately for a Deployment with maxUnavailable=1 and replicas=1 because 0 ready pods satisfies the condition (0 >= 1-1). The pod was still Init:0/2 when port-forward ran. Replace with kubectl rollout status which correctly waits until all desired replicas are Running and Ready, with a 10m timeout to cover the CNPG bootstrap + pip init containers. Also set postgres.embedded.instances=1 in the CI test values; the default 3-instance cluster is unnecessary for smoke testing and adds significant startup latency on single-node k3s clusters. --- .github/workflows/mlflow-ci.yml | 12 +++++------- .../mlflow/tests/helm/nodeport-ingress-disabled.yaml | 5 +++++ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index 5824a2a8..611856d7 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -436,15 +436,13 @@ jobs: exit 1 fi - echo "Waiting for MLflow deployment to be available..." - KUBECONFIG="$KUBECONFIG_FILE" kubectl wait deployment \ - --selector=app.kubernetes.io/name=mlflow \ - --for=condition=Available \ - --timeout=5m \ + echo "Waiting for MLflow rollout to complete..." + KUBECONFIG="$KUBECONFIG_FILE" kubectl rollout status deployment/mlflow \ + --timeout=10m \ -n default || { - echo "ERROR: MLflow deployment not available after 5m" + echo "ERROR: MLflow deployment did not complete rollout within 10m" KUBECONFIG="$KUBECONFIG_FILE" kubectl get pods -n default || true - KUBECONFIG="$KUBECONFIG_FILE" kubectl describe deployment -l app.kubernetes.io/name=mlflow -n default || true + KUBECONFIG="$KUBECONFIG_FILE" kubectl describe deployment/mlflow -n default || true exit 1 } diff --git a/applications/mlflow/tests/helm/nodeport-ingress-disabled.yaml b/applications/mlflow/tests/helm/nodeport-ingress-disabled.yaml index c045bb8d..45ead8e4 100644 --- a/applications/mlflow/tests/helm/nodeport-ingress-disabled.yaml +++ b/applications/mlflow/tests/helm/nodeport-ingress-disabled.yaml @@ -18,3 +18,8 @@ mlflow: # Environment configuration # env: # container: [] + +postgres: + embedded: + # Single instance is sufficient for CI; reduces startup time on 1-node clusters + instances: 1 From f09c59a2011ec43884022e8c0f3f4c3fad275dab Mon Sep 17 00:00:00 2001 From: ada mancini Date: Fri, 27 Mar 2026 11:15:06 -0400 Subject: [PATCH 4/4] ci(mlflow): increase port-forward readiness poll to cover DB migrations MLflow runs alembic DB migrations on first start before binding to port 5000. With no readiness probe, kubectl rollout status completes as soon as the container process starts, not when the server is actually listening. The 30s curl loop (10x3s) was too short to cover the full migration sequence on GKE. Increase to 10s initial wait + 30 attempts x 5s = up to 160s total, which comfortably covers the migration runtime. --- .github/workflows/mlflow-ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mlflow-ci.yml b/.github/workflows/mlflow-ci.yml index 611856d7..993e7189 100644 --- a/.github/workflows/mlflow-ci.yml +++ b/.github/workflows/mlflow-ci.yml @@ -452,14 +452,15 @@ jobs: echo "hostname=localhost:$PORT" >> $GITHUB_OUTPUT echo "Waiting for port-forward to become ready..." + sleep 10 success=false - for i in $(seq 1 10); do + for i in $(seq 1 30); do if curl -sf http://localhost:$PORT/health >/dev/null 2>&1; then echo "MLflow is reachable" success=true break fi - sleep 3 + sleep 5 done if [ "$success" != "true" ]; then