Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions cmd/thv-operator/controllers/mcpserver_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"fmt"
"maps"
"os"
"strconv"
"strings"
"time"

Expand Down Expand Up @@ -1151,6 +1152,25 @@ func (r *MCPServerReconciler) deploymentForMCPServer(
// name collision (ResourceOverrides.Env only accepts plain strings).
env = append(env, r.buildRedisPasswordEnvVar(m)...)

// Project the MCPServer generation pod-template annotation into the
// proxyrunner container via the downward API. The proxyrunner uses this
// to override the value read from the live-mounted RunConfig ConfigMap,
// freezing it per pod at creation time. See #5360.
//
// APIVersion must be explicitly "v1" — the API server defaults it on
// persistence and equality.Semantic.DeepEqual treats "" != "v1" as drift,
// which would otherwise force a Deployment update on every reconcile.
env = append(env, corev1.EnvVar{
Name: kubernetes.EnvVarMCPServerGeneration,
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
APIVersion: "v1",
FieldPath: fmt.Sprintf("metadata.annotations['%s']",
kubernetes.RunConfigMCPServerGenerationAnnotation),
},
},
})

// Add volume mounts for user-defined volumes
for _, v := range m.Spec.Volumes {
volumeMounts = append(volumeMounts, corev1.VolumeMount{
Expand Down Expand Up @@ -1264,6 +1284,12 @@ func (r *MCPServerReconciler) deploymentForMCPServer(
// Add RunConfig checksum annotation to trigger pod rollout when config changes
deploymentTemplateAnnotations = checksum.AddRunConfigChecksumToPodTemplate(deploymentTemplateAnnotations, runConfigChecksum)

// Stamp the MCPServer generation on the proxy Deployment's pod template so the
// downward-API env var below resolves to a value that is frozen at pod creation
// time, not live-updated like the runconfig.json ConfigMap mount. See #5360.
deploymentTemplateAnnotations[kubernetes.RunConfigMCPServerGenerationAnnotation] =
strconv.FormatInt(m.Generation, 10)

if m.Spec.ResourceOverrides != nil && m.Spec.ResourceOverrides.ProxyDeployment != nil {
if m.Spec.ResourceOverrides.ProxyDeployment.Labels != nil {
deploymentLabels = ctrlutil.MergeLabels(ls, m.Spec.ResourceOverrides.ProxyDeployment.Labels)
Expand Down Expand Up @@ -1759,6 +1785,26 @@ func (r *MCPServerReconciler) deploymentNeedsUpdate(
}
}

// Project the MCPServer generation pod-template annotation into the
// proxyrunner container via the downward API. Position must come
// before the embedded-auth env vars below so the slice order matches
// deploymentForMCPServer and equality.Semantic.DeepEqual against
// container.Env succeeds.
//
// APIVersion must mirror the construction site at "v1" — the API
// server defaults it on persistence and an empty string here would
// produce false drift on every reconcile. See #5360.
expectedProxyEnv = append(expectedProxyEnv, corev1.EnvVar{
Comment thread
ChrisJBurns marked this conversation as resolved.
Name: kubernetes.EnvVarMCPServerGeneration,
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
APIVersion: "v1",
FieldPath: fmt.Sprintf("metadata.annotations['%s']",
kubernetes.RunConfigMCPServerGenerationAnnotation),
},
},
})

// Add embedded auth server environment variables. AuthServerRef takes precedence;
// externalAuthConfigRef is used as a fallback (legacy path).
if configName := ctrlutil.EmbeddedAuthServerConfigName(
Expand Down Expand Up @@ -1879,6 +1925,11 @@ func (r *MCPServerReconciler) deploymentNeedsUpdate(
// Check if pod template annotations have changed (including runconfig checksum)
expectedPodTemplateAnnotations := make(map[string]string)
expectedPodTemplateAnnotations = checksum.AddRunConfigChecksumToPodTemplate(expectedPodTemplateAnnotations, runConfigChecksum)
// Mirrors deploymentForMCPServer: stamp the MCPServer generation so the
// downward-API env var injected into the proxyrunner container resolves
// to a frozen-per-pod value (#5360).
expectedPodTemplateAnnotations[kubernetes.RunConfigMCPServerGenerationAnnotation] =
strconv.FormatInt(mcpServer.Generation, 10)

if mcpServer.Spec.ResourceOverrides != nil &&
mcpServer.Spec.ResourceOverrides.ProxyDeployment != nil &&
Expand Down
134 changes: 107 additions & 27 deletions cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,13 @@ func TestResourceOverrides(t *testing.T) {
scheme := runtime.NewScheme()
require.NoError(t, mcpv1beta1.AddToScheme(scheme))

// Note: expectedPodTemplateAnns entries below carry
// "toolhive.stacklok.dev/mcpserver-generation": "0" because the controller
// stamps strconv.FormatInt(m.Generation, 10) and the fake client does not
// auto-increment metadata.generation on Create (the real API server starts
// at 1). Envtest coverage in
// cmd/thv-operator/test-integration/mcp-server/mcpserver_generation_freeze_integration_test.go
// exercises the realistic generation-tracking behavior.
tests := []struct {
name string
mcpServer *mcpv1beta1.MCPServer
Expand Down Expand Up @@ -252,7 +259,8 @@ func TestResourceOverrides(t *testing.T) {
},
expectedDeploymentAnns: map[string]string{},
expectedPodTemplateAnns: map[string]string{
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/mcpserver-generation": "0",
Comment thread
ChrisJBurns marked this conversation as resolved.
},
expectedServiceLabels: map[string]string{
"app": "mcpserver",
Expand Down Expand Up @@ -315,7 +323,8 @@ func TestResourceOverrides(t *testing.T) {
"monitoring/scrape": "true",
},
expectedPodTemplateAnns: map[string]string{
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/mcpserver-generation": "0",
},
expectedServiceLabels: map[string]string{
"app": "mcpserver",
Expand Down Expand Up @@ -376,7 +385,8 @@ func TestResourceOverrides(t *testing.T) {
},
expectedDeploymentAnns: map[string]string{},
expectedPodTemplateAnns: map[string]string{
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/mcpserver-generation": "0",
},
expectedServiceLabels: map[string]string{
"app": "mcpserver",
Expand Down Expand Up @@ -415,7 +425,8 @@ func TestResourceOverrides(t *testing.T) {
},
expectedDeploymentAnns: map[string]string{},
expectedPodTemplateAnns: map[string]string{
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/mcpserver-generation": "0",
},
expectedServiceLabels: map[string]string{
"app": "mcpserver",
Expand Down Expand Up @@ -481,7 +492,8 @@ func TestResourceOverrides(t *testing.T) {
"version": "v1.2.3",
},
expectedPodTemplateAnns: map[string]string{
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/mcpserver-generation": "0",
},
expectedServiceLabels: map[string]string{
"app": "mcpserver",
Expand Down Expand Up @@ -525,9 +537,10 @@ func TestResourceOverrides(t *testing.T) {
},
expectedDeploymentAnns: map[string]string{},
expectedPodTemplateAnns: map[string]string{
"vault.hashicorp.com/agent-inject": "true",
"vault.hashicorp.com/role": "toolhive-mcp-workloads",
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"vault.hashicorp.com/agent-inject": "true",
"vault.hashicorp.com/role": "toolhive-mcp-workloads",
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/mcpserver-generation": "0",
},
expectedServiceLabels: map[string]string{
"app": "mcpserver",
Expand Down Expand Up @@ -582,30 +595,33 @@ func TestResourceOverrides(t *testing.T) {
switch tt.name {
case "with proxy environment variables":
expectedEnvVars = map[string]string{
"HTTP_PROXY": "http://proxy.example.com:8080",
"NO_PROXY": "localhost,127.0.0.1",
"CUSTOM_ENV": "custom-value",
"XDG_CONFIG_HOME": "/tmp",
"HOME": "/tmp",
"TOOLHIVE_RUNTIME": "kubernetes",
"UNSTRUCTURED_LOGS": "false",
"HTTP_PROXY": "http://proxy.example.com:8080",
"NO_PROXY": "localhost,127.0.0.1",
"CUSTOM_ENV": "custom-value",
"THV_MCPSERVER_GENERATION": "", // downward API; Value is empty, ValueFrom set
"XDG_CONFIG_HOME": "/tmp",
"HOME": "/tmp",
"TOOLHIVE_RUNTIME": "kubernetes",
"UNSTRUCTURED_LOGS": "false",
}
case "with debug logging via TOOLHIVE_DEBUG env var":
expectedEnvVars = map[string]string{
"TOOLHIVE_DEBUG": "true",
"XDG_CONFIG_HOME": "/tmp",
"HOME": "/tmp",
"TOOLHIVE_RUNTIME": "kubernetes",
"UNSTRUCTURED_LOGS": "false",
"TOOLHIVE_DEBUG": "true",
"THV_MCPSERVER_GENERATION": "", // downward API; Value is empty, ValueFrom set
"XDG_CONFIG_HOME": "/tmp",
"HOME": "/tmp",
"TOOLHIVE_RUNTIME": "kubernetes",
"UNSTRUCTURED_LOGS": "false",
}
default:
expectedEnvVars = map[string]string{
"LOG_LEVEL": "debug",
"METRICS_ENABLED": "true",
"XDG_CONFIG_HOME": "/tmp",
"HOME": "/tmp",
"TOOLHIVE_RUNTIME": "kubernetes",
"UNSTRUCTURED_LOGS": "false",
"LOG_LEVEL": "debug",
"METRICS_ENABLED": "true",
"THV_MCPSERVER_GENERATION": "", // downward API; Value is empty, ValueFrom set
"XDG_CONFIG_HOME": "/tmp",
"HOME": "/tmp",
"TOOLHIVE_RUNTIME": "kubernetes",
"UNSTRUCTURED_LOGS": "false",
}
}

Expand Down Expand Up @@ -657,7 +673,10 @@ func TestDeploymentForMCPServer_PodTemplateOverridesPreserveRunConfigChecksum(t
assert.Equal(t, "value",
deployment.Spec.Template.Annotations["user.example.com/some-key"],
"user override must survive")
assert.Len(t, deployment.Spec.Template.Annotations, 2,
assert.Contains(t, deployment.Spec.Template.Annotations,
kubernetes.RunConfigMCPServerGenerationAnnotation,
"mcpserver-generation must be stamped for the downward-API env var (#5360)")
assert.Len(t, deployment.Spec.Template.Annotations, 3,
"no extra keys should leak into the pod template")
}

Expand Down Expand Up @@ -1140,3 +1159,64 @@ func TestMCPServerServiceNeedsUpdate(t *testing.T) {
})
}
}

// TestDeploymentForMCPServer_MCPServerGenerationDownwardAPI verifies that the
// proxy Deployment stamps the MCPServer generation as a pod-template annotation
// AND projects that annotation into the proxyrunner container as the
// THV_MCPSERVER_GENERATION env var via the downward API. This is the
// frozen-per-pod path that closes the race described in #5360 — the env var's
// value is bound to the pod's own annotations at creation time, so a restarted
// old-RS pod cannot acquire the new generation by re-reading the live-mounted
// RunConfig ConfigMap.
func TestDeploymentForMCPServer_MCPServerGenerationDownwardAPI(t *testing.T) {
t.Parallel()

scheme := runtime.NewScheme()
require.NoError(t, mcpv1beta1.AddToScheme(scheme))
require.NoError(t, corev1.AddToScheme(scheme))

client := fake.NewClientBuilder().WithScheme(scheme).Build()
r := newTestMCPServerReconciler(client, scheme, kubernetes.PlatformKubernetes)

mcpServer := &mcpv1beta1.MCPServer{
ObjectMeta: metav1.ObjectMeta{
Name: "test-server",
Namespace: "default",
Generation: 7,
},
Spec: mcpv1beta1.MCPServerSpec{
Image: "test-image",
ProxyPort: 8080,
},
}

deployment, err := r.deploymentForMCPServer(t.Context(), mcpServer, "test-checksum")
require.NoError(t, err)
require.NotNil(t, deployment)

assert.Equal(t, "7",
deployment.Spec.Template.Annotations[kubernetes.RunConfigMCPServerGenerationAnnotation],
"pod template must stamp the MCPServer generation so the downward-API env var resolves")

require.Len(t, deployment.Spec.Template.Spec.Containers, 1)
var got *corev1.EnvVar
for i := range deployment.Spec.Template.Spec.Containers[0].Env {
if deployment.Spec.Template.Spec.Containers[0].Env[i].Name == kubernetes.EnvVarMCPServerGeneration {
got = &deployment.Spec.Template.Spec.Containers[0].Env[i]
break
}
}
require.NotNil(t, got, "container must declare the %s env var", kubernetes.EnvVarMCPServerGeneration)
require.NotNil(t, got.ValueFrom, "env var must use ValueFrom (downward API), not a literal Value")
require.NotNil(t, got.ValueFrom.FieldRef)
assert.Equal(t,
"metadata.annotations['"+kubernetes.RunConfigMCPServerGenerationAnnotation+"']",
got.ValueFrom.FieldRef.FieldPath,
"FieldRef must point at the mcpserver-generation pod annotation")
// APIVersion must be set explicitly so the drift comparator at
// deploymentNeedsUpdate matches the API-server-defaulted value. An empty
// APIVersion here results in equality.Semantic.DeepEqual returning false on
// every reconcile, causing perpetual Deployment rewrites. See #5360.
assert.Equal(t, "v1", got.ValueFrom.FieldRef.APIVersion,
"FieldRef.APIVersion must match the API server default of v1 to avoid false drift")
}
Loading
Loading