From 1d459f605650c192a77e3e81bcbcfbdf6ab012da Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Thu, 21 May 2026 21:25:05 +0100 Subject: [PATCH 1/9] Add tests for MCPServer generation freeze invariant Lock-in test in pkg/container/kubernetes/client_test.go documenting the gate's blind spot when two callers pass equal MCPServerGeneration with different images. TDD test in cmd/thv-proxyrunner/app/run_test.go for the env-var override (THV_MCPSERVER_GENERATION) that the fix in #5360 will introduce. Fails today; goes green when the override is implemented. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/thv-proxyrunner/app/run_test.go | 64 +++++++++++++++++++++++++ pkg/container/kubernetes/client_test.go | 64 +++++++++++++++++++++++++ 2 files changed, 128 insertions(+) create mode 100644 cmd/thv-proxyrunner/app/run_test.go diff --git a/cmd/thv-proxyrunner/app/run_test.go b/cmd/thv-proxyrunner/app/run_test.go new file mode 100644 index 0000000000..18210779b2 --- /dev/null +++ b/cmd/thv-proxyrunner/app/run_test.go @@ -0,0 +1,64 @@ +// SPDX-FileCopyrightText: Copyright 2026 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package app + +import ( + "encoding/json" + "os" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/stacklok/toolhive/pkg/runner" +) + +// TestTryLoadConfigFromFile_MCPServerGenerationEnvOverride asserts that when +// THV_MCPSERVER_GENERATION is set in the environment, the proxyrunner's loaded +// RunConfig carries that value rather than the one from runconfig.json. +// +// Issue #5360: the /etc/runconfig ConfigMap volume is mounted live (no +// subPath), so a restarted old-RS proxyrunner pod re-reads the file after a +// helm upgrade and picks up the new MCPServer.metadata.generation. Both old +// and new pods then call DeployWorkload with the same ourGen, defeating the +// strict-greater-than gate at pkg/container/kubernetes/client.go:530. +// +// The fix sources MCPServerGeneration from an env var injected via the +// downward API (frozen at pod creation, parallel to how the image is frozen +// via the CLI positional arg). This test exercises that override and fails +// today because no such override exists in the config-loading path. +func TestTryLoadConfigFromFile_MCPServerGenerationEnvOverride(t *testing.T) { + // Skip when system-wide runconfig paths exist; tryLoadConfigFromFile + // checks them ahead of ./runconfig.json. Avoids false positives on + // machines where toolhive runtime data happens to live in /etc. + for _, p := range []string{kubernetesRunConfigPath, systemRunConfigPath} { + if _, err := os.Stat(p); err == nil { + t.Skipf("skipping: %s exists and would shadow the test fixture", p) + } + } + + dir := t.TempDir() + t.Chdir(dir) + + cfg := &runner.RunConfig{ + SchemaVersion: runner.CurrentSchemaVersion, + MCPServerGeneration: 5, + } + data, err := json.Marshal(cfg) + require.NoError(t, err) + require.NoError(t, os.WriteFile("runconfig.json", data, 0o600)) + + t.Setenv("THV_MCPSERVER_GENERATION", "3") + + loaded, err := tryLoadConfigFromFile() + require.NoError(t, err) + require.NotNil(t, loaded) + + assert.Equal(t, int64(3), loaded.MCPServerGeneration, + "THV_MCPSERVER_GENERATION must override the file value; without "+ + "a frozen-per-pod source for MCPServerGeneration the apply-gate "+ + "at pkg/container/kubernetes/client.go:530 cannot distinguish "+ + "two proxyrunner pods that have both re-read the live-mounted "+ + "ConfigMap (issue #5360)") +} diff --git a/pkg/container/kubernetes/client_test.go b/pkg/container/kubernetes/client_test.go index 8f836a524c..d960b13c0d 100644 --- a/pkg/container/kubernetes/client_test.go +++ b/pkg/container/kubernetes/client_test.go @@ -1645,3 +1645,67 @@ func TestDeployWorkload_RunConfigMCPServerGenerationGate(t *testing.T) { }) } } + +// TestDeployWorkload_EqualGenerationDifferentImageClobbers documents the +// regression mode for issue #5360. Two proxyrunner pods coexist during a +// rolling update; both have read the same MCPServerGeneration N from the +// live-mounted RunConfig ConfigMap, but each holds a different image in +// its CLI positional arg (frozen at pod creation). The gate at +// shouldSkipStatefulSetApply uses strict-greater-than, so equal +// generations cannot distinguish the callers — the stale-image apply +// lands successfully and clobbers the fresh-image apply. +// +// The fix is upstream of this layer: freeze MCPServerGeneration per pod +// (downward-API env var) so the two callers carry different ourGen +// values and the gate fires correctly. This test pins down what the gate +// cannot defend against alone; it continues to pass after the fix +// because the production scenario it models can no longer occur. +func TestDeployWorkload_EqualGenerationDifferentImageClobbers(t *testing.T) { + t.Parallel() + + const containerName = "test-container" + const gen = int64(100) + const freshImage = "fresh-image:new" + const staleImage = "stale-image:old" + + clientset := fake.NewClientset() + client := NewClientWithConfigAndPlatformDetector( + clientset, + &rest.Config{Host: "https://fake-k8s-api.example.com"}, + &mockPlatformDetector{platform: PlatformKubernetes}, + ) + client.waitForStatefulSetReadyFunc = mockWaitForStatefulSetReady + client.namespaceFunc = func() string { return defaultNamespace } + + options := runtime.NewDeployWorkloadOptions() + options.RunConfigMCPServerGeneration = gen + + // Fresh-image proxyrunner pod applies first. + _, err := client.DeployWorkload( + t.Context(), + freshImage, containerName, nil, + map[string]string{}, map[string]string{}, + nil, "streamable-http", options, false, + ) + require.NoError(t, err) + + // Stale-image proxyrunner pod (restarted old-RS pod that re-read the + // live ConfigMap) applies second with the SAME generation. + _, err = client.DeployWorkload( + t.Context(), + staleImage, containerName, nil, + map[string]string{}, map[string]string{}, + nil, "streamable-http", options, false, + ) + require.NoError(t, err) + + sts, err := clientset.AppsV1().StatefulSets(defaultNamespace).Get( + t.Context(), containerName, metav1.GetOptions{}, + ) + require.NoError(t, err) + require.NotEmpty(t, sts.Spec.Template.Spec.Containers) + assert.Equal(t, staleImage, sts.Spec.Template.Spec.Containers[0].Image, + "stale apply must clobber the fresh image when generations are equal; "+ + "the gate cannot defend against this and the fix must live upstream "+ + "(see issue #5360)") +} From 1037d09a90c6a54cd2dd7fe9097258a4bf0daacf Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Thu, 21 May 2026 21:27:28 +0100 Subject: [PATCH 2/9] Freeze proxyrunner MCPServerGeneration via env var The /etc/runconfig ConfigMap volume mounts live (no subPath), so a restarted old-RS proxyrunner pod re-reads runconfig.json after a helm upgrade and picks up the new MCPServer generation. Both old and new pods then call DeployWorkload with the same ourGen, defeating the strict-greater-than gate at shouldSkipStatefulSetApply. Honor THV_MCPSERVER_GENERATION (sourced via downward API in a follow-up operator change) as an override on the file value, freezing the generation per pod at creation time. Parallel to how the image is already frozen via the CLI positional arg. Part of #5360. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/thv-proxyrunner/app/run.go | 26 ++++++++++++++++++++++++++ cmd/thv-proxyrunner/app/run_test.go | 3 ++- pkg/container/kubernetes/client.go | 15 +++++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/cmd/thv-proxyrunner/app/run.go b/cmd/thv-proxyrunner/app/run.go index 32a8695fd5..3a103d060c 100644 --- a/cmd/thv-proxyrunner/app/run.go +++ b/cmd/thv-proxyrunner/app/run.go @@ -8,12 +8,14 @@ import ( "fmt" "log/slog" "os" + "strconv" "github.com/spf13/cobra" "github.com/spf13/viper" regtypes "github.com/stacklok/toolhive-core/registry/types" "github.com/stacklok/toolhive/pkg/container" + "github.com/stacklok/toolhive/pkg/container/kubernetes" "github.com/stacklok/toolhive/pkg/container/runtime" "github.com/stacklok/toolhive/pkg/runner" "github.com/stacklok/toolhive/pkg/workloads/statuses" @@ -151,6 +153,8 @@ func tryLoadConfigFromFile() (*runner.RunConfig, error) { return nil, fmt.Errorf("found config file at %s but failed to parse JSON: %w", path, err) } + applyMCPServerGenerationOverride(runConfig) + slog.Info(fmt.Sprintf("Successfully loaded configuration from %s", path)) return runConfig, nil } @@ -159,6 +163,28 @@ func tryLoadConfigFromFile() (*runner.RunConfig, error) { return nil, fmt.Errorf("configuration file required but no configuration file was found") } +// applyMCPServerGenerationOverride replaces runConfig.MCPServerGeneration with +// the value of the EnvVarMCPServerGeneration environment variable when set. +// The operator projects this env var via the downward API from the pod's +// mcpserver-generation annotation, freezing the value per pod at creation +// time. Without this override the value would come from /etc/runconfig +// (a live-updating ConfigMap volume), letting two coexisting proxyrunner +// pods converge on the same generation during a rolling update and defeat +// the apply-gate at shouldSkipStatefulSetApply. See issue #5360. +func applyMCPServerGenerationOverride(runConfig *runner.RunConfig) { + raw := os.Getenv(kubernetes.EnvVarMCPServerGeneration) + if raw == "" { + return + } + gen, err := strconv.ParseInt(raw, 10, 64) + if err != nil { + slog.Warn("ignoring unparseable env var; falling back to runconfig value", + "env", kubernetes.EnvVarMCPServerGeneration, "value", raw, "err", err) + return + } + runConfig.MCPServerGeneration = gen +} + // runWithFileBasedConfig handles execution when a runconfig.json file is found. // Uses config from file exactly as-is, ignoring all CLI configuration flags. // Only uses essential non-configuration inputs: image, command args, and --k8s-pod-patch. diff --git a/cmd/thv-proxyrunner/app/run_test.go b/cmd/thv-proxyrunner/app/run_test.go index 18210779b2..bead187b64 100644 --- a/cmd/thv-proxyrunner/app/run_test.go +++ b/cmd/thv-proxyrunner/app/run_test.go @@ -11,6 +11,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/stacklok/toolhive/pkg/container/kubernetes" "github.com/stacklok/toolhive/pkg/runner" ) @@ -49,7 +50,7 @@ func TestTryLoadConfigFromFile_MCPServerGenerationEnvOverride(t *testing.T) { require.NoError(t, err) require.NoError(t, os.WriteFile("runconfig.json", data, 0o600)) - t.Setenv("THV_MCPSERVER_GENERATION", "3") + t.Setenv(kubernetes.EnvVarMCPServerGeneration, "3") loaded, err := tryLoadConfigFromFile() require.NoError(t, err) diff --git a/pkg/container/kubernetes/client.go b/pkg/container/kubernetes/client.go index 7bb0a4d049..0b6d72cd09 100644 --- a/pkg/container/kubernetes/client.go +++ b/pkg/container/kubernetes/client.go @@ -58,7 +58,22 @@ const ( // to a version that reads this annotation; operator-only upgrades leave the race window // in place until proxyrunner is also rolled. Exported because it forms a wire contract // that external readers (operator, diagnostic tooling) may consume. + // + // The operator also stamps this same annotation on the proxyrunner Deployment's + // pod template and projects it into the proxyrunner container as the env var + // EnvVarMCPServerGeneration via the downward API. That projection freezes the + // generation per pod at creation time, so two coexisting proxyrunner pods cannot + // converge on the same generation by re-reading the live-mounted RunConfig + // ConfigMap (issue #5360). RunConfigMCPServerGenerationAnnotation = "toolhive.stacklok.dev/mcpserver-generation" + + // EnvVarMCPServerGeneration is the env var name through which the proxyrunner + // container receives its frozen-per-pod MCPServer generation. Sourced via the + // downward API from the pod-template annotation RunConfigMCPServerGenerationAnnotation, + // it overrides the value read from /etc/runconfig/runconfig.json (which would + // otherwise live-update across all proxyrunner pods during a helm upgrade and + // defeat the apply-gate). See issue #5360. + EnvVarMCPServerGeneration = "THV_MCPSERVER_GENERATION" ) // RuntimeName is the name identifier for the Kubernetes runtime From b30e66405f14811fefd02c4c77f205dc31bcad81 Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Thu, 21 May 2026 21:46:14 +0100 Subject: [PATCH 3/9] Project MCPServer generation into proxyrunner via downward API Stamp the MCPServer generation as a pod-template annotation on the proxy Deployment, and project that annotation into the proxyrunner container as the THV_MCPSERVER_GENERATION env var via the downward API. The env var is bound to the pod's own annotations at creation time, so a restarted old-RS pod cannot acquire the new generation by re-reading the live-mounted RunConfig ConfigMap. The proxyrunner already honors this env var as an override on the file value (see prior commit). With both sides wired, two coexisting proxyrunner pods during a rolling update carry distinct generations again and the strict-greater-than gate at shouldSkipStatefulSetApply fires correctly. Mirror the new env var and annotation in deploymentNeedsUpdate so drift detection stays stable. Closes #5360. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../controllers/mcpserver_controller.go | 41 ++++++ .../mcpserver_resource_overrides_test.go | 121 ++++++++++++++---- 2 files changed, 135 insertions(+), 27 deletions(-) diff --git a/cmd/thv-operator/controllers/mcpserver_controller.go b/cmd/thv-operator/controllers/mcpserver_controller.go index eec776c05e..8b8950fb7d 100644 --- a/cmd/thv-operator/controllers/mcpserver_controller.go +++ b/cmd/thv-operator/controllers/mcpserver_controller.go @@ -11,6 +11,7 @@ import ( "fmt" "maps" "os" + "strconv" "strings" "time" @@ -1151,6 +1152,20 @@ func (r *MCPServerReconciler) deploymentForMCPServer( // name collision (ResourceOverrides.Env only accepts plain strings). env = append(env, r.buildRedisPasswordEnvVar(m)...) + // Project the MCPServer generation pod-template annotation into the + // proxyrunner container via the downward API. The proxyrunner uses this + // to override the value read from the live-mounted RunConfig ConfigMap, + // freezing it per pod at creation time. See #5360. + env = append(env, corev1.EnvVar{ + Name: kubernetes.EnvVarMCPServerGeneration, + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: fmt.Sprintf("metadata.annotations['%s']", + kubernetes.RunConfigMCPServerGenerationAnnotation), + }, + }, + }) + // Add volume mounts for user-defined volumes for _, v := range m.Spec.Volumes { volumeMounts = append(volumeMounts, corev1.VolumeMount{ @@ -1264,6 +1279,12 @@ func (r *MCPServerReconciler) deploymentForMCPServer( // Add RunConfig checksum annotation to trigger pod rollout when config changes deploymentTemplateAnnotations = checksum.AddRunConfigChecksumToPodTemplate(deploymentTemplateAnnotations, runConfigChecksum) + // Stamp the MCPServer generation on the proxy Deployment's pod template so the + // downward-API env var below resolves to a value that is frozen at pod creation + // time, not live-updated like the runconfig.json ConfigMap mount. See #5360. + deploymentTemplateAnnotations[kubernetes.RunConfigMCPServerGenerationAnnotation] = + strconv.FormatInt(m.Generation, 10) + if m.Spec.ResourceOverrides != nil && m.Spec.ResourceOverrides.ProxyDeployment != nil { if m.Spec.ResourceOverrides.ProxyDeployment.Labels != nil { deploymentLabels = ctrlutil.MergeLabels(ls, m.Spec.ResourceOverrides.ProxyDeployment.Labels) @@ -1759,6 +1780,21 @@ func (r *MCPServerReconciler) deploymentNeedsUpdate( } } + // Project the MCPServer generation pod-template annotation into the + // proxyrunner container via the downward API. Position mirrors the + // construction site in deploymentForMCPServer (after Redis password, + // before embedded auth) so DeepEqual against container.Env succeeds. + // See #5360. + expectedProxyEnv = append(expectedProxyEnv, corev1.EnvVar{ + Name: kubernetes.EnvVarMCPServerGeneration, + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: fmt.Sprintf("metadata.annotations['%s']", + kubernetes.RunConfigMCPServerGenerationAnnotation), + }, + }, + }) + // Add embedded auth server environment variables. AuthServerRef takes precedence; // externalAuthConfigRef is used as a fallback (legacy path). if configName := ctrlutil.EmbeddedAuthServerConfigName( @@ -1879,6 +1915,11 @@ func (r *MCPServerReconciler) deploymentNeedsUpdate( // Check if pod template annotations have changed (including runconfig checksum) expectedPodTemplateAnnotations := make(map[string]string) expectedPodTemplateAnnotations = checksum.AddRunConfigChecksumToPodTemplate(expectedPodTemplateAnnotations, runConfigChecksum) + // Mirrors deploymentForMCPServer: stamp the MCPServer generation so the + // downward-API env var injected into the proxyrunner container resolves + // to a frozen-per-pod value (#5360). + expectedPodTemplateAnnotations[kubernetes.RunConfigMCPServerGenerationAnnotation] = + strconv.FormatInt(mcpServer.Generation, 10) if mcpServer.Spec.ResourceOverrides != nil && mcpServer.Spec.ResourceOverrides.ProxyDeployment != nil && diff --git a/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go b/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go index 93a0fc9cce..671d3c90a6 100644 --- a/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go +++ b/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go @@ -252,7 +252,8 @@ func TestResourceOverrides(t *testing.T) { }, expectedDeploymentAnns: map[string]string{}, expectedPodTemplateAnns: map[string]string{ - "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + "toolhive.stacklok.dev/mcpserver-generation": "0", }, expectedServiceLabels: map[string]string{ "app": "mcpserver", @@ -315,7 +316,8 @@ func TestResourceOverrides(t *testing.T) { "monitoring/scrape": "true", }, expectedPodTemplateAnns: map[string]string{ - "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + "toolhive.stacklok.dev/mcpserver-generation": "0", }, expectedServiceLabels: map[string]string{ "app": "mcpserver", @@ -376,7 +378,8 @@ func TestResourceOverrides(t *testing.T) { }, expectedDeploymentAnns: map[string]string{}, expectedPodTemplateAnns: map[string]string{ - "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + "toolhive.stacklok.dev/mcpserver-generation": "0", }, expectedServiceLabels: map[string]string{ "app": "mcpserver", @@ -415,7 +418,8 @@ func TestResourceOverrides(t *testing.T) { }, expectedDeploymentAnns: map[string]string{}, expectedPodTemplateAnns: map[string]string{ - "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + "toolhive.stacklok.dev/mcpserver-generation": "0", }, expectedServiceLabels: map[string]string{ "app": "mcpserver", @@ -481,7 +485,8 @@ func TestResourceOverrides(t *testing.T) { "version": "v1.2.3", }, expectedPodTemplateAnns: map[string]string{ - "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + "toolhive.stacklok.dev/mcpserver-generation": "0", }, expectedServiceLabels: map[string]string{ "app": "mcpserver", @@ -525,9 +530,10 @@ func TestResourceOverrides(t *testing.T) { }, expectedDeploymentAnns: map[string]string{}, expectedPodTemplateAnns: map[string]string{ - "vault.hashicorp.com/agent-inject": "true", - "vault.hashicorp.com/role": "toolhive-mcp-workloads", - "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + "vault.hashicorp.com/agent-inject": "true", + "vault.hashicorp.com/role": "toolhive-mcp-workloads", + "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + "toolhive.stacklok.dev/mcpserver-generation": "0", }, expectedServiceLabels: map[string]string{ "app": "mcpserver", @@ -582,30 +588,33 @@ func TestResourceOverrides(t *testing.T) { switch tt.name { case "with proxy environment variables": expectedEnvVars = map[string]string{ - "HTTP_PROXY": "http://proxy.example.com:8080", - "NO_PROXY": "localhost,127.0.0.1", - "CUSTOM_ENV": "custom-value", - "XDG_CONFIG_HOME": "/tmp", - "HOME": "/tmp", - "TOOLHIVE_RUNTIME": "kubernetes", - "UNSTRUCTURED_LOGS": "false", + "HTTP_PROXY": "http://proxy.example.com:8080", + "NO_PROXY": "localhost,127.0.0.1", + "CUSTOM_ENV": "custom-value", + "THV_MCPSERVER_GENERATION": "", // downward API; Value is empty, ValueFrom set + "XDG_CONFIG_HOME": "/tmp", + "HOME": "/tmp", + "TOOLHIVE_RUNTIME": "kubernetes", + "UNSTRUCTURED_LOGS": "false", } case "with debug logging via TOOLHIVE_DEBUG env var": expectedEnvVars = map[string]string{ - "TOOLHIVE_DEBUG": "true", - "XDG_CONFIG_HOME": "/tmp", - "HOME": "/tmp", - "TOOLHIVE_RUNTIME": "kubernetes", - "UNSTRUCTURED_LOGS": "false", + "TOOLHIVE_DEBUG": "true", + "THV_MCPSERVER_GENERATION": "", // downward API; Value is empty, ValueFrom set + "XDG_CONFIG_HOME": "/tmp", + "HOME": "/tmp", + "TOOLHIVE_RUNTIME": "kubernetes", + "UNSTRUCTURED_LOGS": "false", } default: expectedEnvVars = map[string]string{ - "LOG_LEVEL": "debug", - "METRICS_ENABLED": "true", - "XDG_CONFIG_HOME": "/tmp", - "HOME": "/tmp", - "TOOLHIVE_RUNTIME": "kubernetes", - "UNSTRUCTURED_LOGS": "false", + "LOG_LEVEL": "debug", + "METRICS_ENABLED": "true", + "THV_MCPSERVER_GENERATION": "", // downward API; Value is empty, ValueFrom set + "XDG_CONFIG_HOME": "/tmp", + "HOME": "/tmp", + "TOOLHIVE_RUNTIME": "kubernetes", + "UNSTRUCTURED_LOGS": "false", } } @@ -657,7 +666,10 @@ func TestDeploymentForMCPServer_PodTemplateOverridesPreserveRunConfigChecksum(t assert.Equal(t, "value", deployment.Spec.Template.Annotations["user.example.com/some-key"], "user override must survive") - assert.Len(t, deployment.Spec.Template.Annotations, 2, + assert.Contains(t, deployment.Spec.Template.Annotations, + kubernetes.RunConfigMCPServerGenerationAnnotation, + "mcpserver-generation must be stamped for the downward-API env var (#5360)") + assert.Len(t, deployment.Spec.Template.Annotations, 3, "no extra keys should leak into the pod template") } @@ -1140,3 +1152,58 @@ func TestMCPServerServiceNeedsUpdate(t *testing.T) { }) } } + +// TestDeploymentForMCPServer_MCPServerGenerationDownwardAPI verifies that the +// proxy Deployment stamps the MCPServer generation as a pod-template annotation +// AND projects that annotation into the proxyrunner container as the +// THV_MCPSERVER_GENERATION env var via the downward API. This is the +// frozen-per-pod path that closes the race described in #5360 — the env var's +// value is bound to the pod's own annotations at creation time, so a restarted +// old-RS pod cannot acquire the new generation by re-reading the live-mounted +// RunConfig ConfigMap. +func TestDeploymentForMCPServer_MCPServerGenerationDownwardAPI(t *testing.T) { + t.Parallel() + + scheme := runtime.NewScheme() + require.NoError(t, mcpv1beta1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + client := fake.NewClientBuilder().WithScheme(scheme).Build() + r := newTestMCPServerReconciler(client, scheme, kubernetes.PlatformKubernetes) + + mcpServer := &mcpv1beta1.MCPServer{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-server", + Namespace: "default", + Generation: 7, + }, + Spec: mcpv1beta1.MCPServerSpec{ + Image: "test-image", + ProxyPort: 8080, + }, + } + + deployment, err := r.deploymentForMCPServer(t.Context(), mcpServer, "test-checksum") + require.NoError(t, err) + require.NotNil(t, deployment) + + assert.Equal(t, "7", + deployment.Spec.Template.Annotations[kubernetes.RunConfigMCPServerGenerationAnnotation], + "pod template must stamp the MCPServer generation so the downward-API env var resolves") + + require.Len(t, deployment.Spec.Template.Spec.Containers, 1) + var got *corev1.EnvVar + for i := range deployment.Spec.Template.Spec.Containers[0].Env { + if deployment.Spec.Template.Spec.Containers[0].Env[i].Name == kubernetes.EnvVarMCPServerGeneration { + got = &deployment.Spec.Template.Spec.Containers[0].Env[i] + break + } + } + require.NotNil(t, got, "container must declare the %s env var", kubernetes.EnvVarMCPServerGeneration) + require.NotNil(t, got.ValueFrom, "env var must use ValueFrom (downward API), not a literal Value") + require.NotNil(t, got.ValueFrom.FieldRef) + assert.Equal(t, + "metadata.annotations['"+kubernetes.RunConfigMCPServerGenerationAnnotation+"']", + got.ValueFrom.FieldRef.FieldPath, + "FieldRef must point at the mcpserver-generation pod annotation") +} From af2960d7bcc45b3ec38c841700371f15b4a33807 Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Thu, 21 May 2026 22:07:57 +0100 Subject: [PATCH 4/9] Add envtest coverage for generation-freeze contract Two new integration specs assert the operator side of the freeze end-to-end against a real apiserver: - On initial reconcile, the proxy Deployment's pod template carries the mcpserver-generation annotation set to MCPServer.metadata.generation, and the proxyrunner container declares THV_MCPSERVER_GENERATION via a downward-API FieldRef pointing at that exact annotation key. - On a Spec.Image patch the controller bumps the annotation value to the new generation and keeps the FieldRef wired correctly. This is the rolling-update path: new pods get a strictly-greater frozen generation than any restarted old-RS pod. Catches typos in either the annotation key or the FieldRef path that would silently produce an empty env var and let the proxyrunner fall through to the (live-mounted) file value. Complements the unit-level TestDeploymentForMCPServer_MCPServerGenerationDownwardAPI and TestTryLoadConfigFromFile_MCPServerGenerationEnvOverride. Part of #5360. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...rver_generation_freeze_integration_test.go | 201 ++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 cmd/thv-operator/test-integration/mcp-server/mcpserver_generation_freeze_integration_test.go diff --git a/cmd/thv-operator/test-integration/mcp-server/mcpserver_generation_freeze_integration_test.go b/cmd/thv-operator/test-integration/mcp-server/mcpserver_generation_freeze_integration_test.go new file mode 100644 index 0000000000..49e95ca398 --- /dev/null +++ b/cmd/thv-operator/test-integration/mcp-server/mcpserver_generation_freeze_integration_test.go @@ -0,0 +1,201 @@ +// SPDX-FileCopyrightText: Copyright 2026 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +// Package controllers contains integration tests for the per-pod +// MCPServer-generation freeze (#5360). +// +// These tests cover the operator side of the contract — that the proxy +// Deployment's pod template carries the mcpserver-generation annotation and +// that the proxyrunner container declares the THV_MCPSERVER_GENERATION env +// var via the downward API pointing at that annotation. The proxyrunner-side +// override of the file value is covered by +// TestTryLoadConfigFromFile_MCPServerGenerationEnvOverride in +// cmd/thv-proxyrunner/app/run_test.go. +package controllers + +import ( + "strconv" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + mcpv1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1" + "github.com/stacklok/toolhive/pkg/container/kubernetes" +) + +var _ = Describe("MCPServer generation freeze (#5360)", func() { + const ( + timeout = time.Second * 30 + interval = time.Millisecond * 250 + freezeNS = "generation-freeze-test-ns" + annotation = kubernetes.RunConfigMCPServerGenerationAnnotation + ) + + envVarName := kubernetes.EnvVarMCPServerGeneration + expectedFieldPath := "metadata.annotations['" + annotation + "']" + + // findGenerationEnvVar returns the proxyrunner container's + // THV_MCPSERVER_GENERATION env var, or nil if absent. + findGenerationEnvVar := func(deployment *appsv1.Deployment) *corev1.EnvVar { + if len(deployment.Spec.Template.Spec.Containers) == 0 { + return nil + } + for i := range deployment.Spec.Template.Spec.Containers[0].Env { + ev := &deployment.Spec.Template.Spec.Containers[0].Env[i] + if ev.Name == envVarName { + return ev + } + } + return nil + } + + BeforeEach(func() { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: freezeNS}} + _ = k8sClient.Create(ctx, ns) + }) + + cleanupServer := func(key types.NamespacedName) { + fresh := &mcpv1beta1.MCPServer{} + if err := k8sClient.Get(ctx, key, fresh); err != nil { + return + } + if len(fresh.Finalizers) > 0 { + original := fresh.DeepCopy() + fresh.Finalizers = nil + // Test-only teardown: no concurrent writers, so plain MergeFrom is + // fine. Do not copy into reconciler code (see operator rules). + _ = k8sClient.Patch(ctx, fresh, client.MergeFrom(original)) + } + _ = k8sClient.Delete(ctx, fresh) + } + + Context("When the proxy Deployment is created", func() { + It("Stamps the mcpserver-generation annotation and projects it via the downward API", func() { + name := "freeze-initial-reconcile" + server := &mcpv1beta1.MCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: freezeNS}, + Spec: mcpv1beta1.MCPServerSpec{ + Image: "example/mcp-server:v1.0.0", + Transport: "stdio", + ProxyMode: "sse", + ProxyPort: 8080, + MCPPort: 8081, + }, + } + Expect(k8sClient.Create(ctx, server)).To(Succeed()) + key := types.NamespacedName{Name: name, Namespace: freezeNS} + DeferCleanup(func() { cleanupServer(key) }) + + deployment := &appsv1.Deployment{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, key, deployment)).To(Succeed()) + // Wait until the pod template has both annotations the + // controller stamps; otherwise we may sample the deployment + // mid-build. + g.Expect(deployment.Spec.Template.Annotations). + To(HaveKey(annotation)) + }, timeout, interval).Should(Succeed()) + + // The annotation value must equal the live MCPServer.metadata.generation. + fresh := &mcpv1beta1.MCPServer{} + Expect(k8sClient.Get(ctx, key, fresh)).To(Succeed()) + Expect(deployment.Spec.Template.Annotations[annotation]). + To(Equal(strconv.FormatInt(fresh.Generation, 10)), + "pod-template annotation must mirror MCPServer.metadata.generation") + + // The proxyrunner container must carry the downward-API env var + // pointing at that annotation. Field references must use the + // exact annotation key — a typo here silently produces an empty + // env value and the override at run.go falls through to the file + // (defeating the fix). + ev := findGenerationEnvVar(deployment) + Expect(ev).NotTo(BeNil(), + "proxyrunner container must declare the %s env var", envVarName) + Expect(ev.Value).To(BeEmpty(), + "env var must use ValueFrom (downward API), not a literal Value") + Expect(ev.ValueFrom).NotTo(BeNil()) + Expect(ev.ValueFrom.FieldRef).NotTo(BeNil()) + Expect(ev.ValueFrom.FieldRef.FieldPath).To(Equal(expectedFieldPath), + "FieldRef must point at the mcpserver-generation pod annotation") + }) + }) + + Context("When MCPServer.Spec.Image changes (rolling update path)", func() { + It("Bumps the pod-template annotation to the new MCPServer generation", func() { + name := "freeze-spec-bump" + server := &mcpv1beta1.MCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: freezeNS}, + Spec: mcpv1beta1.MCPServerSpec{ + Image: "example/mcp-server:v1", + Transport: "stdio", + ProxyMode: "sse", + ProxyPort: 8080, + MCPPort: 8081, + }, + } + Expect(k8sClient.Create(ctx, server)).To(Succeed()) + key := types.NamespacedName{Name: name, Namespace: freezeNS} + DeferCleanup(func() { cleanupServer(key) }) + + // Wait for the first reconcile to stamp the deployment. + deployment := &appsv1.Deployment{} + var initialGenStr string + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, key, deployment)).To(Succeed()) + g.Expect(deployment.Spec.Template.Annotations).To(HaveKey(annotation)) + initialGenStr = deployment.Spec.Template.Annotations[annotation] + g.Expect(initialGenStr).NotTo(BeEmpty()) + }, timeout, interval).Should(Succeed()) + + fresh := &mcpv1beta1.MCPServer{} + Expect(k8sClient.Get(ctx, key, fresh)).To(Succeed()) + initialGeneration := fresh.Generation + Expect(initialGenStr).To(Equal(strconv.FormatInt(initialGeneration, 10))) + + // Patch Spec.Image. Use a merge patch so other fields aren't + // touched and resourceVersion isn't precondition-checked — we + // expect this to bump .metadata.generation. + Eventually(func() error { + toPatch := &mcpv1beta1.MCPServer{} + if err := k8sClient.Get(ctx, key, toPatch); err != nil { + return err + } + original := toPatch.DeepCopy() + toPatch.Spec.Image = "example/mcp-server:v2" + return k8sClient.Patch(ctx, toPatch, client.MergeFrom(original)) + }, timeout, interval).Should(Succeed()) + + // The controller should re-render the deployment with the new + // generation in the pod-template annotation. This is the value + // that, in production, downward-API-projects into new pods so + // they carry a frozen-per-pod generation strictly greater than + // any restarted old-RS pod. + Eventually(func(g Gomega) { + bumped := &mcpv1beta1.MCPServer{} + g.Expect(k8sClient.Get(ctx, key, bumped)).To(Succeed()) + g.Expect(bumped.Generation).To(BeNumerically(">", initialGeneration), + "patching Spec.Image must bump MCPServer.metadata.generation") + + latest := &appsv1.Deployment{} + g.Expect(k8sClient.Get(ctx, key, latest)).To(Succeed()) + g.Expect(latest.Spec.Template.Annotations[annotation]). + To(Equal(strconv.FormatInt(bumped.Generation, 10)), + "pod-template annotation must track the new MCPServer generation") + + // The downward-API env var must still be wired correctly + // after the rolling-update reconcile. + ev := findGenerationEnvVar(latest) + g.Expect(ev).NotTo(BeNil()) + g.Expect(ev.ValueFrom).NotTo(BeNil()) + g.Expect(ev.ValueFrom.FieldRef).NotTo(BeNil()) + g.Expect(ev.ValueFrom.FieldRef.FieldPath).To(Equal(expectedFieldPath)) + }, timeout, interval).Should(Succeed()) + }) + }) +}) From d2121daf6021437c3e8a85e60e96de2e7b0428b2 Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Thu, 21 May 2026 22:44:10 +0100 Subject: [PATCH 5/9] Address review feedback on #5360 freeze - Defend applyMCPServerGenerationOverride against negative env values. metadata.generation is monotonic non-negative per K8s convention, so a negative override cannot come from a legitimate downward-API projection and must not be allowed to silently disable the apply-gate stamp. Add a Debug log on the success path so the override is observable in pod logs when diagnosing future occurrences. - Add TestApplyMCPServerGenerationOverride exercising the empty, valid, zero, unparseable, and negative branches in isolation. - Soften the position-mirror comment in deploymentNeedsUpdate. The prior text referenced the Redis password slot, but deploymentNeedsUpdate does not include the Redis password env var (pre-existing latent drift bug, tracked separately). - Add a header comment to TestResourceOverrides explaining that the "0" mcpserver-generation annotation value comes from the fake client not auto-incrementing metadata.generation. Realistic generation tracking is exercised by the envtest in mcpserver_generation_freeze_integration_test.go. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../controllers/mcpserver_controller.go | 8 ++--- .../mcpserver_resource_overrides_test.go | 7 ++++ cmd/thv-proxyrunner/app/run.go | 13 +++++++ cmd/thv-proxyrunner/app/run_test.go | 36 +++++++++++++++++++ 4 files changed, 60 insertions(+), 4 deletions(-) diff --git a/cmd/thv-operator/controllers/mcpserver_controller.go b/cmd/thv-operator/controllers/mcpserver_controller.go index 8b8950fb7d..bc4862708f 100644 --- a/cmd/thv-operator/controllers/mcpserver_controller.go +++ b/cmd/thv-operator/controllers/mcpserver_controller.go @@ -1781,10 +1781,10 @@ func (r *MCPServerReconciler) deploymentNeedsUpdate( } // Project the MCPServer generation pod-template annotation into the - // proxyrunner container via the downward API. Position mirrors the - // construction site in deploymentForMCPServer (after Redis password, - // before embedded auth) so DeepEqual against container.Env succeeds. - // See #5360. + // proxyrunner container via the downward API. Position must come + // before the embedded-auth env vars below so the slice order matches + // deploymentForMCPServer and equality.Semantic.DeepEqual against + // container.Env succeeds. See #5360. expectedProxyEnv = append(expectedProxyEnv, corev1.EnvVar{ Name: kubernetes.EnvVarMCPServerGeneration, ValueFrom: &corev1.EnvVarSource{ diff --git a/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go b/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go index 671d3c90a6..2cd297c203 100644 --- a/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go +++ b/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go @@ -222,6 +222,13 @@ func TestResourceOverrides(t *testing.T) { scheme := runtime.NewScheme() require.NoError(t, mcpv1beta1.AddToScheme(scheme)) + // Note: expectedPodTemplateAnns entries below carry + // "toolhive.stacklok.dev/mcpserver-generation": "0" because the controller + // stamps strconv.FormatInt(m.Generation, 10) and the fake client does not + // auto-increment metadata.generation on Create (the real API server starts + // at 1). Envtest coverage in + // cmd/thv-operator/test-integration/mcp-server/mcpserver_generation_freeze_integration_test.go + // exercises the realistic generation-tracking behavior. tests := []struct { name string mcpServer *mcpv1beta1.MCPServer diff --git a/cmd/thv-proxyrunner/app/run.go b/cmd/thv-proxyrunner/app/run.go index 3a103d060c..84897708ce 100644 --- a/cmd/thv-proxyrunner/app/run.go +++ b/cmd/thv-proxyrunner/app/run.go @@ -182,6 +182,19 @@ func applyMCPServerGenerationOverride(runConfig *runner.RunConfig) { "env", kubernetes.EnvVarMCPServerGeneration, "value", raw, "err", err) return } + // metadata.generation is a monotonic non-negative integer per the K8s API + // convention. A negative value cannot have come from a legitimate downward + // API projection of the pod annotation and would silently disable the + // apply-gate stamp at pkg/container/kubernetes/client.go:479-482. + if gen < 0 { + slog.Warn("ignoring negative env var; falling back to runconfig value", + "env", kubernetes.EnvVarMCPServerGeneration, "value", raw) + return + } + slog.Debug("applied MCPServer generation override from env var", + "env", kubernetes.EnvVarMCPServerGeneration, + "file_value", runConfig.MCPServerGeneration, + "env_value", gen) runConfig.MCPServerGeneration = gen } diff --git a/cmd/thv-proxyrunner/app/run_test.go b/cmd/thv-proxyrunner/app/run_test.go index bead187b64..95219e5b3f 100644 --- a/cmd/thv-proxyrunner/app/run_test.go +++ b/cmd/thv-proxyrunner/app/run_test.go @@ -63,3 +63,39 @@ func TestTryLoadConfigFromFile_MCPServerGenerationEnvOverride(t *testing.T) { "two proxyrunner pods that have both re-read the live-mounted "+ "ConfigMap (issue #5360)") } + +// TestApplyMCPServerGenerationOverride exercises the override helper in +// isolation, covering the defensive-validation branches: empty env (no-op), +// unparseable env (fall through), negative env (fall through), and the +// happy path. metadata.generation is a monotonic non-negative integer per +// the K8s API convention, so a negative value cannot have come from a +// legitimate downward-API projection and must not be allowed to silently +// disable the apply-gate stamp. +func TestApplyMCPServerGenerationOverride(t *testing.T) { + tests := []struct { + name string + envValue string // "" means don't set + fileGen int64 + wantGen int64 + }{ + {name: "env unset preserves file value", envValue: "", fileGen: 5, wantGen: 5}, + {name: "valid env overrides file", envValue: "3", fileGen: 5, wantGen: 3}, + {name: "zero env overrides file (caller's choice)", envValue: "0", fileGen: 5, wantGen: 0}, + {name: "unparseable env preserves file value", envValue: "not-a-number", fileGen: 5, wantGen: 5}, + {name: "negative env preserves file value", envValue: "-1", fileGen: 5, wantGen: 5}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + if tc.envValue != "" { + t.Setenv(kubernetes.EnvVarMCPServerGeneration, tc.envValue) + } else { + // Explicitly clear so a stray env from the host doesn't leak. + t.Setenv(kubernetes.EnvVarMCPServerGeneration, "") + } + cfg := &runner.RunConfig{MCPServerGeneration: tc.fileGen} + applyMCPServerGenerationOverride(cfg) + assert.Equal(t, tc.wantGen, cfg.MCPServerGeneration) + }) + } +} From 010f9890e926f3454964c4009d2bcdc88aa9a99a Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Thu, 21 May 2026 23:04:14 +0100 Subject: [PATCH 6/9] Fix CI: codespell typo and gofmt alignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - "unparseable" → "unparsable" (codespell) in run.go, run_test.go. - Realign env-var map literals to gofmt's preferred column for the longest key (THV_MCPSERVER_GENERATION). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../mcpserver_resource_overrides_test.go | 42 +++++++++---------- cmd/thv-proxyrunner/app/run.go | 2 +- cmd/thv-proxyrunner/app/run_test.go | 4 +- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go b/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go index 2cd297c203..4ae5a3a339 100644 --- a/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go +++ b/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go @@ -595,33 +595,33 @@ func TestResourceOverrides(t *testing.T) { switch tt.name { case "with proxy environment variables": expectedEnvVars = map[string]string{ - "HTTP_PROXY": "http://proxy.example.com:8080", - "NO_PROXY": "localhost,127.0.0.1", - "CUSTOM_ENV": "custom-value", - "THV_MCPSERVER_GENERATION": "", // downward API; Value is empty, ValueFrom set - "XDG_CONFIG_HOME": "/tmp", - "HOME": "/tmp", - "TOOLHIVE_RUNTIME": "kubernetes", - "UNSTRUCTURED_LOGS": "false", + "HTTP_PROXY": "http://proxy.example.com:8080", + "NO_PROXY": "localhost,127.0.0.1", + "CUSTOM_ENV": "custom-value", + "THV_MCPSERVER_GENERATION": "", // downward API; Value is empty, ValueFrom set + "XDG_CONFIG_HOME": "/tmp", + "HOME": "/tmp", + "TOOLHIVE_RUNTIME": "kubernetes", + "UNSTRUCTURED_LOGS": "false", } case "with debug logging via TOOLHIVE_DEBUG env var": expectedEnvVars = map[string]string{ - "TOOLHIVE_DEBUG": "true", - "THV_MCPSERVER_GENERATION": "", // downward API; Value is empty, ValueFrom set - "XDG_CONFIG_HOME": "/tmp", - "HOME": "/tmp", - "TOOLHIVE_RUNTIME": "kubernetes", - "UNSTRUCTURED_LOGS": "false", + "TOOLHIVE_DEBUG": "true", + "THV_MCPSERVER_GENERATION": "", // downward API; Value is empty, ValueFrom set + "XDG_CONFIG_HOME": "/tmp", + "HOME": "/tmp", + "TOOLHIVE_RUNTIME": "kubernetes", + "UNSTRUCTURED_LOGS": "false", } default: expectedEnvVars = map[string]string{ - "LOG_LEVEL": "debug", - "METRICS_ENABLED": "true", - "THV_MCPSERVER_GENERATION": "", // downward API; Value is empty, ValueFrom set - "XDG_CONFIG_HOME": "/tmp", - "HOME": "/tmp", - "TOOLHIVE_RUNTIME": "kubernetes", - "UNSTRUCTURED_LOGS": "false", + "LOG_LEVEL": "debug", + "METRICS_ENABLED": "true", + "THV_MCPSERVER_GENERATION": "", // downward API; Value is empty, ValueFrom set + "XDG_CONFIG_HOME": "/tmp", + "HOME": "/tmp", + "TOOLHIVE_RUNTIME": "kubernetes", + "UNSTRUCTURED_LOGS": "false", } } diff --git a/cmd/thv-proxyrunner/app/run.go b/cmd/thv-proxyrunner/app/run.go index 84897708ce..6e1875f327 100644 --- a/cmd/thv-proxyrunner/app/run.go +++ b/cmd/thv-proxyrunner/app/run.go @@ -178,7 +178,7 @@ func applyMCPServerGenerationOverride(runConfig *runner.RunConfig) { } gen, err := strconv.ParseInt(raw, 10, 64) if err != nil { - slog.Warn("ignoring unparseable env var; falling back to runconfig value", + slog.Warn("ignoring unparsable env var; falling back to runconfig value", "env", kubernetes.EnvVarMCPServerGeneration, "value", raw, "err", err) return } diff --git a/cmd/thv-proxyrunner/app/run_test.go b/cmd/thv-proxyrunner/app/run_test.go index 95219e5b3f..a6f9859c53 100644 --- a/cmd/thv-proxyrunner/app/run_test.go +++ b/cmd/thv-proxyrunner/app/run_test.go @@ -66,7 +66,7 @@ func TestTryLoadConfigFromFile_MCPServerGenerationEnvOverride(t *testing.T) { // TestApplyMCPServerGenerationOverride exercises the override helper in // isolation, covering the defensive-validation branches: empty env (no-op), -// unparseable env (fall through), negative env (fall through), and the +// unparsable env (fall through), negative env (fall through), and the // happy path. metadata.generation is a monotonic non-negative integer per // the K8s API convention, so a negative value cannot have come from a // legitimate downward-API projection and must not be allowed to silently @@ -81,7 +81,7 @@ func TestApplyMCPServerGenerationOverride(t *testing.T) { {name: "env unset preserves file value", envValue: "", fileGen: 5, wantGen: 5}, {name: "valid env overrides file", envValue: "3", fileGen: 5, wantGen: 3}, {name: "zero env overrides file (caller's choice)", envValue: "0", fileGen: 5, wantGen: 0}, - {name: "unparseable env preserves file value", envValue: "not-a-number", fileGen: 5, wantGen: 5}, + {name: "unparsable env preserves file value", envValue: "not-a-number", fileGen: 5, wantGen: 5}, {name: "negative env preserves file value", envValue: "-1", fileGen: 5, wantGen: 5}, } From cc7c9f4dbc4ee5ea8a4804e6a4b6330abf4e57cf Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Thu, 21 May 2026 23:41:51 +0100 Subject: [PATCH 7/9] Set FieldRef.APIVersion=v1 to avoid perpetual drift MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ObjectFieldSelector.APIVersion is defaulted to "v1" by the API server on persistence. The drift comparator at deploymentNeedsUpdate was rebuilding the expected env var with an empty APIVersion, so equality.Semantic.DeepEqual returned false on every reconcile and the controller perpetually rewrote the proxy Deployment. That constant churn re-triggered the exact rolling- update race the freeze was supposed to close — confirmed by the failing VirtualMCPServer Optimizer + Circuit Breaker recovery spec in test-e2e-lifecycle: stale-image proxyrunner pods kept clobbering the new StatefulSet apply, leaving the backend mcp container in ImagePullBackOff. Explicitly set APIVersion: "v1" on both the construction site and the drift comparator so they match the API-server-defaulted value. Add unit-level assertion in TestDeploymentForMCPServer_MCPServerGenerationDownwardAPI and a new envtest spec "Does not flag spurious drift on a no-op reconcile" that watches the Deployment's resourceVersion across multiple reconciles and fails fast if drift detection misfires. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../controllers/mcpserver_controller.go | 12 +++- .../mcpserver_resource_overrides_test.go | 6 ++ ...rver_generation_freeze_integration_test.go | 55 +++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) diff --git a/cmd/thv-operator/controllers/mcpserver_controller.go b/cmd/thv-operator/controllers/mcpserver_controller.go index bc4862708f..b184deb98b 100644 --- a/cmd/thv-operator/controllers/mcpserver_controller.go +++ b/cmd/thv-operator/controllers/mcpserver_controller.go @@ -1156,10 +1156,15 @@ func (r *MCPServerReconciler) deploymentForMCPServer( // proxyrunner container via the downward API. The proxyrunner uses this // to override the value read from the live-mounted RunConfig ConfigMap, // freezing it per pod at creation time. See #5360. + // + // APIVersion must be explicitly "v1" — the API server defaults it on + // persistence and equality.Semantic.DeepEqual treats "" != "v1" as drift, + // which would otherwise force a Deployment update on every reconcile. env = append(env, corev1.EnvVar{ Name: kubernetes.EnvVarMCPServerGeneration, ValueFrom: &corev1.EnvVarSource{ FieldRef: &corev1.ObjectFieldSelector{ + APIVersion: "v1", FieldPath: fmt.Sprintf("metadata.annotations['%s']", kubernetes.RunConfigMCPServerGenerationAnnotation), }, @@ -1784,11 +1789,16 @@ func (r *MCPServerReconciler) deploymentNeedsUpdate( // proxyrunner container via the downward API. Position must come // before the embedded-auth env vars below so the slice order matches // deploymentForMCPServer and equality.Semantic.DeepEqual against - // container.Env succeeds. See #5360. + // container.Env succeeds. + // + // APIVersion must mirror the construction site at "v1" — the API + // server defaults it on persistence and an empty string here would + // produce false drift on every reconcile. See #5360. expectedProxyEnv = append(expectedProxyEnv, corev1.EnvVar{ Name: kubernetes.EnvVarMCPServerGeneration, ValueFrom: &corev1.EnvVarSource{ FieldRef: &corev1.ObjectFieldSelector{ + APIVersion: "v1", FieldPath: fmt.Sprintf("metadata.annotations['%s']", kubernetes.RunConfigMCPServerGenerationAnnotation), }, diff --git a/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go b/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go index 4ae5a3a339..9a24033853 100644 --- a/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go +++ b/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go @@ -1213,4 +1213,10 @@ func TestDeploymentForMCPServer_MCPServerGenerationDownwardAPI(t *testing.T) { "metadata.annotations['"+kubernetes.RunConfigMCPServerGenerationAnnotation+"']", got.ValueFrom.FieldRef.FieldPath, "FieldRef must point at the mcpserver-generation pod annotation") + // APIVersion must be set explicitly so the drift comparator at + // deploymentNeedsUpdate matches the API-server-defaulted value. An empty + // APIVersion here results in equality.Semantic.DeepEqual returning false on + // every reconcile, causing perpetual Deployment rewrites. See #5360. + assert.Equal(t, "v1", got.ValueFrom.FieldRef.APIVersion, + "FieldRef.APIVersion must match the API server default of v1 to avoid false drift") } diff --git a/cmd/thv-operator/test-integration/mcp-server/mcpserver_generation_freeze_integration_test.go b/cmd/thv-operator/test-integration/mcp-server/mcpserver_generation_freeze_integration_test.go index 49e95ca398..3748a50df9 100644 --- a/cmd/thv-operator/test-integration/mcp-server/mcpserver_generation_freeze_integration_test.go +++ b/cmd/thv-operator/test-integration/mcp-server/mcpserver_generation_freeze_integration_test.go @@ -126,6 +126,61 @@ var _ = Describe("MCPServer generation freeze (#5360)", func() { }) }) + Context("After the initial reconcile settles", func() { + It("Does not flag spurious drift on a no-op reconcile (#5360 regression)", func() { + // Regression test for a defaulting trap: ObjectFieldSelector.APIVersion + // is defaulted to "v1" by the API server on persistence. If the + // drift-check code rebuilds the env var with an empty APIVersion, + // equality.Semantic.DeepEqual returns false on every reconcile and + // the controller perpetually re-applies the Deployment, defeating + // the rolling-update freeze the env var is supposed to provide. + name := "freeze-no-drift" + server := &mcpv1beta1.MCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: freezeNS}, + Spec: mcpv1beta1.MCPServerSpec{ + Image: "example/mcp-server:v1.0.0", + Transport: "stdio", + ProxyMode: "sse", + ProxyPort: 8080, + MCPPort: 8081, + }, + } + Expect(k8sClient.Create(ctx, server)).To(Succeed()) + key := types.NamespacedName{Name: name, Namespace: freezeNS} + DeferCleanup(func() { cleanupServer(key) }) + + deployment := &appsv1.Deployment{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, key, deployment)).To(Succeed()) + g.Expect(deployment.Spec.Template.Annotations).To(HaveKey(annotation)) + }, timeout, interval).Should(Succeed()) + + // Capture the post-reconcile resourceVersion. + settledRV := deployment.ResourceVersion + + // Give the controller a few extra reconciles to run. If drift + // detection is broken, each reconcile would re-Update the + // Deployment and bump its resourceVersion. + Consistently(func(g Gomega) { + latest := &appsv1.Deployment{} + g.Expect(k8sClient.Get(ctx, key, latest)).To(Succeed()) + g.Expect(latest.ResourceVersion).To(Equal(settledRV), + "Deployment must not be rewritten on no-op reconciles; "+ + "check ObjectFieldSelector.APIVersion defaulting against the drift comparator") + }, time.Second*5, interval).Should(Succeed()) + + // Sanity-check that the persisted env var has the API-server-defaulted + // APIVersion. This is the value the drift comparator must match. + ev := findGenerationEnvVar(deployment) + Expect(ev).NotTo(BeNil()) + Expect(ev.ValueFrom).NotTo(BeNil()) + Expect(ev.ValueFrom.FieldRef).NotTo(BeNil()) + Expect(ev.ValueFrom.FieldRef.APIVersion).To(Equal("v1"), + "API server defaults ObjectFieldSelector.APIVersion to v1; "+ + "the operator must construct the env var with the same value to avoid false drift") + }) + }) + Context("When MCPServer.Spec.Image changes (rolling update path)", func() { It("Bumps the pod-template annotation to the new MCPServer generation", func() { name := "freeze-spec-bump" From 8ed6e39ef758b3e5882a232058f5b0a3555a383e Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Fri, 22 May 2026 15:21:15 +0100 Subject: [PATCH 8/9] TEMP debug: promote gate skip log + override log to INFO Temporary diagnostic for #5360 v1.33/v1.34 E2E failure. Will be reverted before merge. --- cmd/thv-proxyrunner/app/run.go | 3 ++- pkg/container/kubernetes/client.go | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/cmd/thv-proxyrunner/app/run.go b/cmd/thv-proxyrunner/app/run.go index 6e1875f327..eeb8735718 100644 --- a/cmd/thv-proxyrunner/app/run.go +++ b/cmd/thv-proxyrunner/app/run.go @@ -191,7 +191,8 @@ func applyMCPServerGenerationOverride(runConfig *runner.RunConfig) { "env", kubernetes.EnvVarMCPServerGeneration, "value", raw) return } - slog.Debug("applied MCPServer generation override from env var", + // TEMPORARY: INFO level for #5360 diagnosis. Demote to Debug before merge. + slog.Info("applied MCPServer generation override from env var", "env", kubernetes.EnvVarMCPServerGeneration, "file_value", runConfig.MCPServerGeneration, "env_value", gen) diff --git a/pkg/container/kubernetes/client.go b/pkg/container/kubernetes/client.go index 0b6d72cd09..1a529b4993 100644 --- a/pkg/container/kubernetes/client.go +++ b/pkg/container/kubernetes/client.go @@ -543,10 +543,14 @@ func (c *Client) shouldSkipStatefulSetApply( return false, nil } if theirsGen > ourGen { - slog.Debug("skipping StatefulSet apply; newer MCPServer generation already applied", + // TEMPORARY: INFO level for #5360 diagnosis. Demote to Debug before merge. + slog.Info("skipping StatefulSet apply; newer MCPServer generation already applied", "sts", name, "ours", ourGen, "theirs", theirsGen) return true, nil } + // TEMPORARY: INFO level for #5360 diagnosis. Demote to Debug before merge. + slog.Info("StatefulSet apply proceeding", + "sts", name, "ours", ourGen, "theirs", theirsGen) return false, nil } From 8634d697e768b60e8df9442e4bbb073314cb353a Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Fri, 22 May 2026 15:52:16 +0100 Subject: [PATCH 9/9] Revert debug INFO logs; port STS-template wait to optimizer test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The debug commit (8ed6e39e) confirmed the override + gate are working correctly end-to-end on a real cluster: pod env=3 (latest): file=3 env=3, apply proceeds — writes GOOD image pod env=2 (stale): file=3 env=2, gate skips (theirs=3 > ours=2) Revert the INFO promotion now that diagnosis is complete; both logs go back to Debug. The remaining E2E flake is a pre-existing race in virtualmcp_optimizer_circuit_breaker_test.go: the test deletes the Pending StatefulSet pod immediately after patching Spec.Image without waiting for the proxyrunner to apply the new template, so the pod can be recreated against the stale (bad) template and the StatefulSet controller may not re-roll an already-unhealthy pod afterwards. Mirrors the same guard added to virtualmcp_circuit_breaker_test.go in #5079. Before the apply-gate, the race "resolved" by the stale-image apply periodically resetting the template, letting the test catch a lucky good window. With the gate in place the apply order is deterministic, which makes the missing wait visible — hence the test fix lands here. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/thv-proxyrunner/app/run.go | 3 +-- pkg/container/kubernetes/client.go | 6 +---- ...rtualmcp_optimizer_circuit_breaker_test.go | 27 +++++++++++++++++++ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/cmd/thv-proxyrunner/app/run.go b/cmd/thv-proxyrunner/app/run.go index eeb8735718..6e1875f327 100644 --- a/cmd/thv-proxyrunner/app/run.go +++ b/cmd/thv-proxyrunner/app/run.go @@ -191,8 +191,7 @@ func applyMCPServerGenerationOverride(runConfig *runner.RunConfig) { "env", kubernetes.EnvVarMCPServerGeneration, "value", raw) return } - // TEMPORARY: INFO level for #5360 diagnosis. Demote to Debug before merge. - slog.Info("applied MCPServer generation override from env var", + slog.Debug("applied MCPServer generation override from env var", "env", kubernetes.EnvVarMCPServerGeneration, "file_value", runConfig.MCPServerGeneration, "env_value", gen) diff --git a/pkg/container/kubernetes/client.go b/pkg/container/kubernetes/client.go index 1a529b4993..0b6d72cd09 100644 --- a/pkg/container/kubernetes/client.go +++ b/pkg/container/kubernetes/client.go @@ -543,14 +543,10 @@ func (c *Client) shouldSkipStatefulSetApply( return false, nil } if theirsGen > ourGen { - // TEMPORARY: INFO level for #5360 diagnosis. Demote to Debug before merge. - slog.Info("skipping StatefulSet apply; newer MCPServer generation already applied", + slog.Debug("skipping StatefulSet apply; newer MCPServer generation already applied", "sts", name, "ours", ourGen, "theirs", theirsGen) return true, nil } - // TEMPORARY: INFO level for #5360 diagnosis. Demote to Debug before merge. - slog.Info("StatefulSet apply proceeding", - "sts", name, "ours", ourGen, "theirs", theirsGen) return false, nil } diff --git a/test/e2e/thv-operator/virtualmcp/virtualmcp_optimizer_circuit_breaker_test.go b/test/e2e/thv-operator/virtualmcp/virtualmcp_optimizer_circuit_breaker_test.go index 1886fa492a..4e763c220e 100644 --- a/test/e2e/thv-operator/virtualmcp/virtualmcp_optimizer_circuit_breaker_test.go +++ b/test/e2e/thv-operator/virtualmcp/virtualmcp_optimizer_circuit_breaker_test.go @@ -10,6 +10,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -297,6 +298,32 @@ var _ = Describe("VirtualMCPServer Optimizer with Circuit Breaker", Ordered, fun backend.Spec.Image = images.YardstickServerImage Expect(k8sClient.Update(ctx, backend)).To(Succeed()) + By("Waiting for backend StatefulSet template to use the fixed image") + // Without this wait we can race the proxyrunner: deleting the + // Pending pod before the StatefulSet template has flipped to the + // good image just causes it to be recreated against the old (bad) + // template, and the StatefulSet controller may not re-roll an + // already-unhealthy pod afterwards. Mirrors the same guard in + // virtualmcp_circuit_breaker_test.go added in #5079. + Eventually(func() error { + sts := &appsv1.StatefulSet{} + if err := k8sClient.Get(ctx, types.NamespacedName{ + Name: unstableName, + Namespace: testNamespace, + }, sts); err != nil { + return err + } + for _, container := range sts.Spec.Template.Spec.Containers { + if container.Name == "mcp" { + if container.Image != images.YardstickServerImage { + return fmt.Errorf("statefulset still has image %q", container.Image) + } + return nil + } + } + return fmt.Errorf("mcp container not found in statefulset template") + }, timeout, pollingInterval).Should(Succeed()) + By("Deleting stuck pods to force recreation with fixed image") podList := &corev1.PodList{} Expect(k8sClient.List(ctx, podList,