diff --git a/cmd/thv-operator/api/v1beta1/embeddingserver_types.go b/cmd/thv-operator/api/v1beta1/embeddingserver_types.go index 297579303c..a275fb91f7 100644 --- a/cmd/thv-operator/api/v1beta1/embeddingserver_types.go +++ b/cmd/thv-operator/api/v1beta1/embeddingserver_types.go @@ -211,6 +211,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=emb;embedding,categories=toolhive //+kubebuilder:printcolumn:name="Status",type="string",JSONPath=".status.phase" //+kubebuilder:printcolumn:name="Model",type="string",JSONPath=".spec.model" diff --git a/cmd/thv-operator/api/v1beta1/mcpexternalauthconfig_types.go b/cmd/thv-operator/api/v1beta1/mcpexternalauthconfig_types.go index a4e823ac14..7ee2f65fde 100644 --- a/cmd/thv-operator/api/v1beta1/mcpexternalauthconfig_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpexternalauthconfig_types.go @@ -1058,6 +1058,7 @@ type MCPExternalAuthConfigStatus struct { // +kubebuilder:object:root=true // +kubebuilder:storageversion // +kubebuilder:subresource:status +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true // +kubebuilder:resource:shortName=extauth;mcpextauth,categories=toolhive // +kubebuilder:printcolumn:name="Type",type=string,JSONPath=`.spec.type` // +kubebuilder:printcolumn:name="Valid",type=string,JSONPath=`.status.conditions[?(@.type=='Valid')].status` diff --git a/cmd/thv-operator/api/v1beta1/mcpgroup_types.go b/cmd/thv-operator/api/v1beta1/mcpgroup_types.go index e0900f0e05..ddbecce45d 100644 --- a/cmd/thv-operator/api/v1beta1/mcpgroup_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpgroup_types.go @@ -88,6 +88,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=mcpg;mcpgroup,categories=toolhive //+kubebuilder:printcolumn:name="Servers",type="integer",JSONPath=".status.serverCount" //+kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase" diff --git a/cmd/thv-operator/api/v1beta1/mcpoidcconfig_types.go b/cmd/thv-operator/api/v1beta1/mcpoidcconfig_types.go index 492e699087..35e597bac8 100644 --- a/cmd/thv-operator/api/v1beta1/mcpoidcconfig_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpoidcconfig_types.go @@ -200,6 +200,7 @@ type MCPOIDCConfigStatus struct { // +kubebuilder:object:root=true // +kubebuilder:storageversion // +kubebuilder:subresource:status +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true // +kubebuilder:resource:shortName=mcpoidc,categories=toolhive // +kubebuilder:printcolumn:name="Source",type=string,JSONPath=`.spec.type` // +kubebuilder:printcolumn:name="Valid",type=string,JSONPath=`.status.conditions[?(@.type=='Valid')].status` diff --git a/cmd/thv-operator/api/v1beta1/mcpregistry_types.go b/cmd/thv-operator/api/v1beta1/mcpregistry_types.go index fd4e8d01aa..0846773e72 100644 --- a/cmd/thv-operator/api/v1beta1/mcpregistry_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpregistry_types.go @@ -205,6 +205,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:printcolumn:name="Status",type="string",JSONPath=".status.phase" //+kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" //+kubebuilder:printcolumn:name="Replicas",type="integer",JSONPath=".status.readyReplicas" diff --git a/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go b/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go index 3b38d591a7..2105719152 100644 --- a/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go @@ -348,6 +348,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=rp;mcprp,categories=toolhive //+kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase" //+kubebuilder:printcolumn:name="Remote URL",type="string",JSONPath=".spec.remoteUrl" diff --git a/cmd/thv-operator/api/v1beta1/mcpserver_types.go b/cmd/thv-operator/api/v1beta1/mcpserver_types.go index 6eaa4dcca6..1253e95e63 100644 --- a/cmd/thv-operator/api/v1beta1/mcpserver_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpserver_types.go @@ -884,6 +884,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=mcpserver;mcpservers,categories=toolhive //+kubebuilder:printcolumn:name="Status",type="string",JSONPath=".status.phase" //+kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" diff --git a/cmd/thv-operator/api/v1beta1/mcpserverentry_types.go b/cmd/thv-operator/api/v1beta1/mcpserverentry_types.go index 04c989230b..17d18ecd4d 100644 --- a/cmd/thv-operator/api/v1beta1/mcpserverentry_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpserverentry_types.go @@ -164,6 +164,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=mcpentry,categories=toolhive //+kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase" //+kubebuilder:printcolumn:name="Transport",type="string",JSONPath=".spec.transport" diff --git a/cmd/thv-operator/api/v1beta1/mcptelemetryconfig_types.go b/cmd/thv-operator/api/v1beta1/mcptelemetryconfig_types.go index 7e1685227c..c35eb73de2 100644 --- a/cmd/thv-operator/api/v1beta1/mcptelemetryconfig_types.go +++ b/cmd/thv-operator/api/v1beta1/mcptelemetryconfig_types.go @@ -139,6 +139,7 @@ type MCPTelemetryConfigStatus struct { // +kubebuilder:object:root=true // +kubebuilder:storageversion // +kubebuilder:subresource:status +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true // +kubebuilder:resource:shortName=mcpotel,categories=toolhive // +kubebuilder:printcolumn:name="Endpoint",type=string,JSONPath=`.spec.openTelemetry.endpoint` // +kubebuilder:printcolumn:name="Valid",type=string,JSONPath=`.status.conditions[?(@.type=='Valid')].status` diff --git a/cmd/thv-operator/api/v1beta1/toolconfig_types.go b/cmd/thv-operator/api/v1beta1/toolconfig_types.go index 96781260cc..b28b37030a 100644 --- a/cmd/thv-operator/api/v1beta1/toolconfig_types.go +++ b/cmd/thv-operator/api/v1beta1/toolconfig_types.go @@ -112,6 +112,7 @@ type MCPToolConfigStatus struct { // +kubebuilder:object:root=true // +kubebuilder:storageversion // +kubebuilder:subresource:status +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true // +kubebuilder:resource:shortName=tc;toolconfig,categories=toolhive // +kubebuilder:printcolumn:name="Valid",type=string,JSONPath=`.status.conditions[?(@.type=='Valid')].status` // +kubebuilder:printcolumn:name="References",type=integer,JSONPath=`.status.referenceCount` diff --git a/cmd/thv-operator/api/v1beta1/virtualmcpcompositetooldefinition_types.go b/cmd/thv-operator/api/v1beta1/virtualmcpcompositetooldefinition_types.go index e2a4ec0da6..7f09e7ebeb 100644 --- a/cmd/thv-operator/api/v1beta1/virtualmcpcompositetooldefinition_types.go +++ b/cmd/thv-operator/api/v1beta1/virtualmcpcompositetooldefinition_types.go @@ -100,6 +100,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=vmcpctd;compositetool,categories=toolhive //+kubebuilder:printcolumn:name="Workflow",type="string",JSONPath=".spec.name",description="Workflow name" //+kubebuilder:printcolumn:name="Steps",type="integer",JSONPath=".spec.steps[*]",description="Number of steps" diff --git a/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go b/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go index d4f31b5d66..1daee7cde4 100644 --- a/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go +++ b/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go @@ -464,6 +464,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=vmcp;virtualmcp,categories=toolhive //+kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase",description="The phase of the VirtualMCPServer" //+kubebuilder:printcolumn:name="URL",type="string",JSONPath=".status.url",description="Virtual MCP server URL" diff --git a/cmd/thv-operator/app/app.go b/cmd/thv-operator/app/app.go index 4bfb1b52fc..ae4402d762 100644 --- a/cmd/thv-operator/app/app.go +++ b/cmd/thv-operator/app/app.go @@ -12,9 +12,11 @@ import ( "fmt" "log/slog" "os" + "strconv" "strings" "github.com/go-logr/logr" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -42,10 +44,20 @@ var ( setupLog = log.Log.WithName("setup") ) +// envEnableStorageVersionMigrator is the operator-wide opt-out for the +// StorageVersionMigrator controller. Unlike the other controllers — which are +// always-on after the operator-wide feature flag system was removed — the +// migrator retains a feature flag because admins running an external +// kube-storage-version-migrator deployment need a way to disable the in-operator +// controller. Setting this env var to a falsey value ("false", "0", "f") +// skips registering the migrator with the manager. +const envEnableStorageVersionMigrator = "ENABLE_STORAGE_VERSION_MIGRATOR" + func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) utilruntime.Must(mcpv1alpha1.AddToScheme(scheme)) utilruntime.Must(mcpv1beta1.AddToScheme(scheme)) + utilruntime.Must(apiextensionsv1.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme } @@ -150,10 +162,53 @@ func setupControllersAndWebhooks(mgr ctrl.Manager, imagePullSecretsDefaults imag if err := setupAggregationControllers(mgr, imagePullSecretsDefaults); err != nil { return err } + if isStorageVersionMigratorEnabled() { + if err := setupStorageVersionMigrator(mgr); err != nil { + return err + } + } else { + setupLog.Info(envEnableStorageVersionMigrator + " is disabled, skipping StorageVersionMigrator controller") + } //+kubebuilder:scaffold:builder return nil } +// setupStorageVersionMigrator wires the StorageVersionMigrator controller into +// the manager. The controller reconciles status.storedVersions on opted-in +// toolhive.stacklok.dev CRDs so a future operator release can drop deprecated +// versions from spec.versions without orphaning etcd objects. +func setupStorageVersionMigrator(mgr ctrl.Manager) error { + if err := (&controllers.StorageVersionMigratorReconciler{ + Client: mgr.GetClient(), + APIReader: mgr.GetAPIReader(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("storageversionmigrator-controller"), + }).SetupWithManager(mgr); err != nil { + return fmt.Errorf("unable to create controller StorageVersionMigrator: %w", err) + } + return nil +} + +// isStorageVersionMigratorEnabled reports whether the StorageVersionMigrator +// controller should be registered. Defaults to true; can be disabled by setting +// ENABLE_STORAGE_VERSION_MIGRATOR to a falsey value. An unparseable value logs +// a warning and uses the default. +func isStorageVersionMigratorEnabled() bool { + value, found := os.LookupEnv(envEnableStorageVersionMigrator) + if !found { + return true + } + enabled, err := strconv.ParseBool(value) + if err != nil { + setupLog.Info( + "invalid boolean value for "+envEnableStorageVersionMigrator+", using default (enabled)", + "value", value, + ) + return true + } + return enabled +} + // setupGroupRefFieldIndexes sets up field indexing for spec.groupRef on all resource types // that can reference an MCPGroup. This enables efficient lookups by groupRef in controllers. func setupGroupRefFieldIndexes(mgr ctrl.Manager) error { diff --git a/cmd/thv-operator/controllers/marker_coverage_test.go b/cmd/thv-operator/controllers/marker_coverage_test.go new file mode 100644 index 0000000000..bce7f70741 --- /dev/null +++ b/cmd/thv-operator/controllers/marker_coverage_test.go @@ -0,0 +1,151 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package controllers + +import ( + "bufio" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestV1beta1TypesMarkerCoverage guards against a subtle failure mode in the +// opt-in-label design for storage-version migration: if a new CRD root type is +// added to cmd/thv-operator/api/v1beta1/ without the +// +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true +// +// marker, the StorageVersionMigrator controller silently excludes it from +// reconciliation. The problem surfaces only when a future release tries to +// drop a deprecated version — at which point it is far too late. +// +// The test scans every root type (marker block contains +// +kubebuilder:object:root=true AND +kubebuilder:storageversion) and fails if +// any lacks either the migrate marker or an explicit +// +thv:storage-version-migrator:exclude sibling marker. List types +// (+kubebuilder:object:root=true WITHOUT +kubebuilder:storageversion) are +// intentionally skipped — CRD-level labels are keyed on the root type, not +// the list type. +func TestV1beta1TypesMarkerCoverage(t *testing.T) { + t.Parallel() + + typesDir := filepath.Join("..", "api", "v1beta1") + entries, err := os.ReadDir(typesDir) + require.NoError(t, err, "reading %s", typesDir) + + const ( + rootMarker = "+kubebuilder:object:root=true" + storageMarker = "+kubebuilder:storageversion" + migrateMarker = "+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true" + excludeMarker = "+thv:storage-version-migrator:exclude" + groupversionInfoGo = "groupversion_info.go" + zzGeneratedPrefix = "zz_generated" + suffixTypesGo = "_types.go" + ) + + type rootType struct { + file string + typeName string + markerBlock []string + } + + var roots []rootType + + for _, e := range entries { + if e.IsDir() { + continue + } + name := e.Name() + if !strings.HasSuffix(name, suffixTypesGo) { + continue + } + if name == groupversionInfoGo || strings.HasPrefix(name, zzGeneratedPrefix) { + continue + } + + path := filepath.Join(typesDir, name) + f, err := os.Open(path) + require.NoError(t, err, "open %s", path) + + scanner := bufio.NewScanner(f) + // Raise the max token size — some of the *_types.go files have very + // long comment lines (printcolumn JSONPath expressions). + scanner.Buffer(make([]byte, 64*1024), 1024*1024) + + var block []string + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + + switch { + case strings.HasPrefix(line, "//"): + // Accumulate every comment/marker line. kubebuilder convention + // often separates markers from the godoc comment with a blank + // line, so we keep the block alive across blanks and only + // reset on a non-comment, non-blank, non-type line. + block = append(block, strings.TrimSpace(strings.TrimPrefix(line, "//"))) + case strings.HasPrefix(line, "type "): + // Only root-level types matter (must contain object:root=true + // AND storageversion markers). List types have only object:root=true. + if containsMarker(block, rootMarker) && containsMarker(block, storageMarker) { + typeName := extractTypeName(line) + if typeName != "" { + copied := append([]string(nil), block...) + roots = append(roots, rootType{file: name, typeName: typeName, markerBlock: copied}) + } + } + block = nil + case line == "": + // Blank line — keep block alive (comment-then-blank-then-type + // is idiomatic Go + kubebuilder). + default: + // Anything else (e.g. struct body, package clause, import) — + // drop any in-flight comment block. + block = nil + } + } + require.NoError(t, scanner.Err(), "scan %s", path) + require.NoError(t, f.Close()) + } + + require.NotEmpty(t, roots, + "no v1beta1 root types found — scanner likely broken; this test is meaningless without coverage") + + for _, r := range roots { + hasMigrate := containsMarker(r.markerBlock, migrateMarker) + hasExclude := containsMarker(r.markerBlock, excludeMarker) + assert.Truef(t, hasMigrate || hasExclude, + "v1beta1 root type %s.%s is missing either\n"+ + " %s\n"+ + "(opt in to storage-version migration) or\n"+ + " %s\n"+ + "(explicit opt-out). Every root type must declare one. See\n"+ + "cmd/thv-operator/controllers/storageversionmigrator_controller.go for context.", + r.file, r.typeName, migrateMarker, excludeMarker) + } +} + +// containsMarker returns true if any line in block contains the given +// marker substring. +func containsMarker(block []string, marker string) bool { + for _, l := range block { + if strings.Contains(l, marker) { + return true + } + } + return false +} + +// extractTypeName returns the identifier in a line of the form `type Foo struct {`. +// Returns empty string if the line is not a type declaration we care about. +func extractTypeName(line string) string { + fields := strings.Fields(line) + if len(fields) < 2 || fields[0] != "type" { + return "" + } + return fields[1] +} diff --git a/cmd/thv-operator/controllers/storageversionmigrator_controller.go b/cmd/thv-operator/controllers/storageversionmigrator_controller.go new file mode 100644 index 0000000000..10c147c6b3 --- /dev/null +++ b/cmd/thv-operator/controllers/storageversionmigrator_controller.go @@ -0,0 +1,481 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package controllers + +import ( + "context" + "errors" + "fmt" + "strings" + "sync" + "time" + + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + apitypes "k8s.io/apimachinery/pkg/types" + kerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" +) + +// Public contract for the StorageVersionMigrator controller. + +// AutoMigrateLabel identifies CRDs that opt in to storage-version migration. +// Applied via a kubebuilder marker on each root type in api/v1beta1/. +const AutoMigrateLabel = "toolhive.stacklok.dev/auto-migrate-storage-version" + +// AutoMigrateValue is the label value that enables migration for a CRD. +const AutoMigrateValue = "true" + +// ToolhiveGroup is the API group the controller is scoped to (belt-and-braces +// filter in addition to the opt-in label). +const ToolhiveGroup = "toolhive.stacklok.dev" + +// EventReasonMigrationSucceeded and EventReasonMigrationFailed are the event +// reasons emitted on the owning CRD when a migration completes or fails. +const ( + EventReasonMigrationSucceeded = "StorageVersionMigrationSucceeded" + EventReasonMigrationFailed = "StorageVersionMigrationFailed" +) + +const ( + defaultMigrationCacheTTL = 1 * time.Hour + defaultListPageSize = 500 + defaultCacheGCInterval = 10 * time.Minute +) + +// errMigrationRetriedDueToConflicts is returned by restoreCRs when at least one +// CR re-store hit a typed Conflict (and no other errors occurred). The caller +// must NOT trim CRD.status.storedVersions in this case: the post-conflict state +// of the affected object is unverified, so reasoning about whether the storage +// re-encode actually happened is unsafe. The next reconcile retries cleanly. +var errMigrationRetriedDueToConflicts = errors.New( + "storage version migration retried due to concurrent writes; storedVersions left unchanged") + +// The wildcard CR RBAC below is intentional. The set of opted-in CRDs isn't +// known at codegen time — it's a per-CRD runtime label decision — so the +// kubebuilder marker can't enumerate kinds. The runtime gate is the +// isManagedCRD check inside Reconcile, which requires both the toolhive +// API group AND the opt-in label. Wildcard RBAC plus isManagedCRD form the +// defence in depth: RBAC bounds the controller to a single API group, and +// the label gate further restricts it to opted-in CRDs. + +//+kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions,verbs=get;list;watch +//+kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions/status,verbs=update;patch +//+kubebuilder:rbac:groups=toolhive.stacklok.dev,resources=*,verbs=get;list;update +//+kubebuilder:rbac:groups=toolhive.stacklok.dev,resources=*/status,verbs=update + +// StorageVersionMigratorReconciler reconciles CustomResourceDefinition objects +// in the toolhive.stacklok.dev group that carry the opt-in +// AutoMigrateLabel=AutoMigrateValue. For each such CRD it re-stores every CR +// at the current storage version by doing a Get + Update on the live object. +// The Update is a full PUT of the unmodified object; the apiserver re-encodes +// the request body at the current storage version, then compares the +// resulting bytes to what's in etcd. When the CR was originally stored at a +// different version (the actual migration scenario) the bytes carry a +// different apiVersion stamp than etcd's record, the comparison fails, and +// the write proceeds — re-encoding the object at the current storage +// version. When the CR is already at the current storage version, the bytes +// match and the apiserver harmlessly elides the write — there was nothing to +// migrate. After all CRs have been processed it patches +// CRD.status.storedVersions down to [] so a future +// release can drop deprecated versions from spec.versions without orphaning +// etcd objects. See https://github.com/kubernetes-sigs/kube-storage-version-migrator/issues/65 +// for the upstream maintainers' explanation of this mechanism. +// +// Enabled by default. Opt out operator-wide via +// operator.features.storageVersionMigrator (ENABLE_STORAGE_VERSION_MIGRATOR=false) +// for admins who prefer to run kube-storage-version-migrator externally. +// Per-kind escape hatch: remove the label from the CRD (emergency only — will +// be re-applied by GitOps / helm upgrade). +type StorageVersionMigratorReconciler struct { + // used for CR Update writes and the CRD /status storedVersions patch; + // reads go through APIReader to bypass the informer cache. + client.Client + APIReader client.Reader // live reads for CRDs and CR list pages (bypasses informer) + Scheme *runtime.Scheme // kubebuilder reconciler convention + Recorder record.EventRecorder // MigrationSucceeded / MigrationFailed events on the CRD + PageSize int64 // overrideable for tests; defaults to defaultListPageSize + CacheGCInterval time.Duration // overrideable for tests; defaults to defaultCacheGCInterval + cache *migrationCache +} + +// Reconcile runs for each opted-in toolhive.stacklok.dev CRD event. See the +// package-level docs on StorageVersionMigratorReconciler for the full flow. +// Returns a non-nil error to trigger exponential backoff; the CRD watch +// re-enqueues on any status change, so explicit requeue intervals are not used. +func (r *StorageVersionMigratorReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx).WithValues("crd", req.Name) + + r.ensureInitialized() + + // Live-read the CRD. Informer cache may lag label or storedVersions updates. + crd := &apiextensionsv1.CustomResourceDefinition{} + if err := r.APIReader.Get(ctx, req.NamespacedName, crd); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, fmt.Errorf("get CRD %s: %w", req.Name, err) + } + + // Re-verify the filter against live state; watch predicate could have + // fired on stale informer data. + if !isManagedCRD(crd) { + return ctrl.Result{}, nil + } + + storageVersion, ok := findStorageVersion(crd) + if !ok { + // CRDs without a storage version are malformed from our perspective; + // log and skip rather than fail (the API server would have rejected + // a CRD without a storage version, so this is unreachable in practice). + logger.Info("CRD has no storage version, skipping", "spec.versions", crd.Spec.Versions) + return ctrl.Result{}, nil + } + + if !isMigrationNeeded(crd, storageVersion) { + return ctrl.Result{}, nil + } + + logger.Info("migrating storage versions", + "storageVersion", storageVersion, + "storedVersions", crd.Status.StoredVersions, + ) + + if err := r.restoreCRs(ctx, crd, storageVersion); err != nil { + r.Recorder.Eventf(crd, "Warning", EventReasonMigrationFailed, + "storage version migration failed: %v", err) + return ctrl.Result{}, fmt.Errorf("re-store CRs for %s: %w", crd.Name, err) + } + + if err := r.patchStoredVersions(ctx, crd, storageVersion); err != nil { + r.Recorder.Eventf(crd, "Warning", EventReasonMigrationFailed, + "storedVersions patch failed: %v", err) + return ctrl.Result{}, fmt.Errorf("patch storedVersions for %s: %w", crd.Name, err) + } + + r.Recorder.Eventf(crd, "Normal", EventReasonMigrationSucceeded, + "storage version migrated to %s", storageVersion) + logger.Info("storage version migration complete", "storageVersion", storageVersion) + return ctrl.Result{}, nil +} + +// SetupWithManager wires the reconciler to watch CRDs using PartialObjectMetadata +// (no full-object cache), filtered on the opt-in label and the toolhive.stacklok.dev +// group. The filter is evaluated twice — once on informer events here, and again +// inside Reconcile after the live APIReader read — because label removals can +// briefly race the informer. +// +// It also registers a Runnable that periodically sweeps expired entries from +// the migration cache so deleted CRs (whose UIDs never recur in subsequent +// list pages and therefore never trigger lazy eviction in has()) don't grow +// the map without bound on long-running operators with high CR churn. +func (r *StorageVersionMigratorReconciler) SetupWithManager(mgr ctrl.Manager) error { + r.ensureInitialized() + + labelSelector, err := labels.Parse(AutoMigrateLabel + "=" + AutoMigrateValue) + if err != nil { + return fmt.Errorf("parse label selector: %w", err) + } + + if err := ctrl.NewControllerManagedBy(mgr). + Named("storageversionmigrator"). + For( + &apiextensionsv1.CustomResourceDefinition{}, + builder.OnlyMetadata, + builder.WithPredicates( + predicate.NewPredicateFuncs(func(obj client.Object) bool { + return labelSelector.Matches(labels.Set(obj.GetLabels())) && + isToolhiveCRDName(obj.GetName()) + }), + predicate.ResourceVersionChangedPredicate{}, + ), + ). + Complete(r); err != nil { + return err + } + + // Periodic cache GC. Registered after Complete so the controller is fully + // wired when the runnable starts. + return mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { + ticker := time.NewTicker(r.CacheGCInterval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + r.cache.gc() + } + } + })) +} + +// ------------------------------------------------------------------ +// Private implementation below. +// ------------------------------------------------------------------ + +func (r *StorageVersionMigratorReconciler) ensureInitialized() { + if r.PageSize == 0 { + r.PageSize = defaultListPageSize + } + if r.CacheGCInterval == 0 { + r.CacheGCInterval = defaultCacheGCInterval + } + if r.cache == nil { + r.cache = newMigrationCache(defaultMigrationCacheTTL) + } +} + +// restoreCRs lists all CRs of the CRD's served kind (served version = storageVersion) +// and issues a main-resource Update on each one, forcing the apiserver to +// re-encode the object at the current storage version. +// +// Per-CR error handling: +// - IsNotFound: silently skipped (object deleted between list and update — +// it can't be at the old storage version anymore). +// - IsConflict: silently skipped at the per-CR level, but a function-level +// counter is incremented. After the loop, if any conflicts occurred and no +// other errors did, errMigrationRetriedDueToConflicts is returned so the +// caller leaves storedVersions untouched (the post-conflict state of the +// conflicting object is unverified). +// - All other errors are aggregated and returned. +func (r *StorageVersionMigratorReconciler) restoreCRs( + ctx context.Context, + crd *apiextensionsv1.CustomResourceDefinition, + storageVersion string, +) error { + logger := log.FromContext(ctx) + gvk := schema.GroupVersionKind{ + Group: crd.Spec.Group, + Version: storageVersion, + Kind: crd.Spec.Names.Kind, + } + + listGVK := gvk + listGVK.Kind = crd.Spec.Names.ListKind + + var errs []error + conflicts := 0 + var continueToken string + for { + list := &unstructured.UnstructuredList{} + list.SetGroupVersionKind(listGVK) + listOpts := []client.ListOption{client.Limit(r.PageSize)} + if continueToken != "" { + listOpts = append(listOpts, client.Continue(continueToken)) + } + if err := r.APIReader.List(ctx, list, listOpts...); err != nil { + return fmt.Errorf("list %s: %w", listGVK.String(), err) + } + + if err := meta.EachListItem(list, func(obj runtime.Object) error { + u, ok := obj.(*unstructured.Unstructured) + if !ok { + errs = append(errs, fmt.Errorf("unexpected list item type %T", obj)) + return nil + } + if r.cache.has(crd.Name, u.GetUID(), u.GetResourceVersion()) { + return nil + } + restored, err := r.restoreOne(ctx, gvk, u) + if err != nil { + switch { + case apierrors.IsNotFound(err): + logger.V(1).Info("skip CR — deleted", + "object", client.ObjectKeyFromObject(u), "err", err) + case apierrors.IsConflict(err): + conflicts++ + logger.V(1).Info("skip CR — concurrent write conflict", + "object", client.ObjectKeyFromObject(u), "err", err) + default: + errs = append(errs, fmt.Errorf("re-store %s/%s: %w", + u.GetNamespace(), u.GetName(), err)) + } + return nil + } + r.cache.add(crd.Name, restored.GetUID(), restored.GetResourceVersion()) + return nil + }); err != nil { + errs = append(errs, err) + } + + continueToken = list.GetContinue() + if continueToken == "" { + break + } + } + + if len(errs) == 0 && conflicts > 0 { + return errMigrationRetriedDueToConflicts + } + return kerrors.NewAggregate(errs) +} + +// restoreOne issues a plain Get + Update on the live CR. The apiserver +// re-encodes the request body at the current storage version and compares +// it to etcd's record; when the CR was originally stored at a different +// apiVersion the bytes differ, the write proceeds, and etcd is re-encoded +// at the current storage version. When the CR is already at the current +// storage version the bytes match and the apiserver harmlessly elides the +// write — there was nothing to migrate. The Update goes through the main +// resource, so validating/mutating admission webhooks on the kind see this +// request as part of normal admission flow; only requests that actually +// persist produce downstream state changes. Returns the live object after +// the update so the caller can record its post-update resourceVersion in +// the cache. +func (r *StorageVersionMigratorReconciler) restoreOne( + ctx context.Context, + gvk schema.GroupVersionKind, + original *unstructured.Unstructured, +) (*unstructured.Unstructured, error) { + live := &unstructured.Unstructured{} + live.SetGroupVersionKind(gvk) + if err := r.APIReader.Get(ctx, client.ObjectKeyFromObject(original), live); err != nil { + // IsNotFound is propagated to the caller, which handles it. + return nil, err + } + if err := r.Update(ctx, live); err != nil { + return nil, err + } + return live, nil +} + +// patchStoredVersions overwrites CRD.status.storedVersions to exactly +// [storageVersion], using an optimistic lock on the CRD's resourceVersion so +// a concurrent API-server write rejects the patch and triggers a requeue. +func (r *StorageVersionMigratorReconciler) patchStoredVersions( + ctx context.Context, + crd *apiextensionsv1.CustomResourceDefinition, + storageVersion string, +) error { + original := crd.DeepCopy() + crd.Status.StoredVersions = []string{storageVersion} + return r.Client.Status().Patch(ctx, crd, + client.MergeFromWithOptions(original, client.MergeFromWithOptimisticLock{})) +} + +// isManagedCRD returns true if a CRD is opted in to migration: the group matches +// toolhive.stacklok.dev and the opt-in label is set to the expected value. +func isManagedCRD(crd *apiextensionsv1.CustomResourceDefinition) bool { + if crd.Spec.Group != ToolhiveGroup { + return false + } + return crd.GetLabels()[AutoMigrateLabel] == AutoMigrateValue +} + +// isToolhiveCRDName checks whether a CRD name is of the form .toolhive.stacklok.dev, +// which is sufficient to filter at watch time. Reconcile re-verifies via the live CRD. +func isToolhiveCRDName(name string) bool { + return strings.HasSuffix(name, "."+ToolhiveGroup) +} + +// findStorageVersion returns the single version marked storage=true in the CRD spec. +func findStorageVersion(crd *apiextensionsv1.CustomResourceDefinition) (string, bool) { + for _, v := range crd.Spec.Versions { + if v.Storage { + return v.Name, true + } + } + return "", false +} + +// isMigrationNeeded returns true iff status.storedVersions is anything other +// than exactly [storageVersion]. The set of served versions does not affect +// this check — under spec.conversion.strategy=None with identical schemas, +// normal writers cannot reintroduce stale versions to storedVersions, so a +// defensive re-scan based on servedCount has no scenario to defend against. +func isMigrationNeeded( + crd *apiextensionsv1.CustomResourceDefinition, + storageVersion string, +) bool { + stored := crd.Status.StoredVersions + return len(stored) != 1 || stored[0] != storageVersion +} + +// ------------------------------------------------------------------ +// migrationCache: short-lived de-duplication of re-store writes. +// ------------------------------------------------------------------ + +// migrationCache records successfully-migrated (UID, resourceVersion) pairs +// so subsequent reconciles within the TTL window skip already-fresh objects. +// It is a correctness optimization only — a cache miss simply issues a +// redundant (but harmless) Update. +// +// Eviction: lazy on lookup in has(), plus a periodic sweep via gc() driven +// from a manager.Runnable registered in SetupWithManager. The periodic sweep +// is required because lookups never recur for deleted CRs, so without it +// their entries would persist forever. +type migrationCache struct { + mu sync.Mutex + entries map[string]cacheEntry + ttl time.Duration + now func() time.Time +} + +type cacheEntry struct { + resourceVersion string + expiresAt time.Time +} + +func newMigrationCache(ttl time.Duration) *migrationCache { + return &migrationCache{ + entries: make(map[string]cacheEntry), + ttl: ttl, + now: time.Now, + } +} + +func (c *migrationCache) has(crdName string, uid apitypes.UID, resourceVersion string) bool { + key := c.key(crdName, uid) + c.mu.Lock() + defer c.mu.Unlock() + entry, ok := c.entries[key] + if !ok { + return false + } + if c.now().After(entry.expiresAt) { + delete(c.entries, key) + return false + } + return entry.resourceVersion == resourceVersion +} + +func (c *migrationCache) add(crdName string, uid apitypes.UID, resourceVersion string) { + key := c.key(crdName, uid) + c.mu.Lock() + defer c.mu.Unlock() + c.entries[key] = cacheEntry{ + resourceVersion: resourceVersion, + expiresAt: c.now().Add(c.ttl), + } +} + +// gc evicts every expired entry from the cache. Called from a periodic +// manager.Runnable so entries for deleted CRs (whose UIDs never recur in +// subsequent list pages) don't accumulate without bound. +func (c *migrationCache) gc() { + c.mu.Lock() + defer c.mu.Unlock() + now := c.now() + for k, e := range c.entries { + if now.After(e.expiresAt) { + delete(c.entries, k) + } + } +} + +func (*migrationCache) key(crdName string, uid apitypes.UID) string { + return crdName + "|" + string(uid) +} diff --git a/cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go b/cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go new file mode 100644 index 0000000000..35c318a27d --- /dev/null +++ b/cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go @@ -0,0 +1,712 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package storageversionmigrator + +import ( + "context" + "fmt" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/stacklok/toolhive/cmd/thv-operator/controllers" +) + +const ( + toolhiveGroup = controllers.ToolhiveGroup + migrateLabel = controllers.AutoMigrateLabel + migrateValue = controllers.AutoMigrateValue +) + +// crdSpec describes a test CRD fixture. +type crdSpec struct { + Name string + Group string + Kind string + ListKind string + Plural string + Singular string + Versions []versionSpec + Labelled bool + HasStatusOnStored bool +} + +type versionSpec struct { + Name string + Served bool + Storage bool +} + +func buildCRD(s crdSpec) *apiextensionsv1.CustomResourceDefinition { + versions := make([]apiextensionsv1.CustomResourceDefinitionVersion, 0, len(s.Versions)) + for _, v := range s.Versions { + cdv := apiextensionsv1.CustomResourceDefinitionVersion{ + Name: v.Name, + Served: v.Served, + Storage: v.Storage, + Schema: &apiextensionsv1.CustomResourceValidation{ + OpenAPIV3Schema: &apiextensionsv1.JSONSchemaProps{ + Type: "object", + Properties: map[string]apiextensionsv1.JSONSchemaProps{ + "spec": { + Type: "object", + XPreserveUnknownFields: ptrBool(true), + }, + "status": { + Type: "object", + XPreserveUnknownFields: ptrBool(true), + }, + }, + }, + }, + } + if v.Storage && s.HasStatusOnStored { + cdv.Subresources = &apiextensionsv1.CustomResourceSubresources{ + Status: &apiextensionsv1.CustomResourceSubresourceStatus{}, + } + } + versions = append(versions, cdv) + } + + crd := &apiextensionsv1.CustomResourceDefinition{ + ObjectMeta: metav1.ObjectMeta{Name: s.Name}, + Spec: apiextensionsv1.CustomResourceDefinitionSpec{ + Group: s.Group, + Names: apiextensionsv1.CustomResourceDefinitionNames{ + Kind: s.Kind, + ListKind: s.ListKind, + Plural: s.Plural, + Singular: s.Singular, + }, + Scope: apiextensionsv1.NamespaceScoped, + Versions: versions, + }, + } + if s.Labelled { + crd.Labels = map[string]string{migrateLabel: migrateValue} + } + return crd +} + +func ptrBool(b bool) *bool { return &b } + +// installCRD creates a CRD and waits for the apiserver to publish it so +// unstructured CR creates of that kind will succeed. +func installCRD(c crdSpec) { + crd := buildCRD(c) + Expect(k8sClient.Create(ctx, crd)).To(Succeed()) + + Eventually(func() bool { + live := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: c.Name}, live); err != nil { + return false + } + for _, cond := range live.Status.Conditions { + if cond.Type == apiextensionsv1.Established && cond.Status == apiextensionsv1.ConditionTrue { + return true + } + } + return false + }, time.Second*10, time.Millisecond*200).Should(BeTrue(), "CRD %s never became Established", c.Name) +} + +func deleteCRD(name string) { + crd := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: name}, crd); err != nil { + if apierrors.IsNotFound(err) { + return + } + Fail(fmt.Sprintf("get CRD %s before delete: %v", name, err)) + } + Expect(k8sClient.Delete(ctx, crd)).To(Succeed()) + Eventually(func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, types.NamespacedName{Name: name}, &apiextensionsv1.CustomResourceDefinition{})) + }, time.Second*30, time.Millisecond*200).Should(BeTrue(), "CRD %s never fully deleted", name) +} + +// setStoredVersions overwrites status.storedVersions, simulating a historical +// state where objects were stored at earlier versions. +func setStoredVersions(crdName string, versions []string) { + Eventually(func() error { + crd := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: crdName}, crd); err != nil { + return err + } + orig := crd.DeepCopy() + crd.Status.StoredVersions = versions + return k8sClient.Status().Patch(ctx, crd, client.MergeFrom(orig)) + }, time.Second*5, time.Millisecond*100).Should(Succeed()) +} + +func getStoredVersions(crdName string) []string { + crd := &apiextensionsv1.CustomResourceDefinition{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: crdName}, crd)).To(Succeed()) + return append([]string{}, crd.Status.StoredVersions...) +} + +// createCRs creates count CRs in the default namespace with the given kind +// and a name derived from basename. Returns the created objects so tests can +// assert on them post-reconcile. +func createCRs(gvk schema.GroupVersionKind, basename string, count int) []*unstructured.Unstructured { + out := make([]*unstructured.Unstructured, 0, count) + for i := 0; i < count; i++ { + u := &unstructured.Unstructured{} + u.SetGroupVersionKind(gvk) + u.SetNamespace("default") + u.SetName(fmt.Sprintf("%s-%d", basename, i)) + Expect(unstructured.SetNestedField(u.Object, "placeholder", "spec", "marker")).To(Succeed()) + Expect(k8sClient.Create(ctx, u)).To(Succeed()) + out = append(out, u) + } + return out +} + +// Note on "did the re-store actually fire" verification: +// +// The controller does a plain Get + Update on each CR. When the CR is already +// at the current storage version the apiserver freshly re-encodes the request +// body, sees it matches etcd byte-for-byte, and elides the write — that's +// correct behaviour, not a controller bug, and it means the per-CR RV does +// not bump for an already-clean CR. The dedicated cross-version test +// ("re-encodes CRs that are stored at a prior storage version") proves the +// migration mechanism actually works for objects stored at older versions: +// it stores a CR at v1alpha1, flips storage to v1beta1, and asserts the CR's +// RV bumps after reconcile. +// +// The pagination test additionally verifies the continue-token loop via a +// list-call counter, and the partial-failure test asserts storedVersions is +// not trimmed when any CR re-store fails. + +// newReconciler constructs a StorageVersionMigratorReconciler for a single +// test. Every test has its own instance so the migration cache doesn't leak +// between tests and state is fully explicit. +func newReconciler() *controllers.StorageVersionMigratorReconciler { + return &controllers.StorageVersionMigratorReconciler{ + Client: k8sClient, + APIReader: k8sClient, + Scheme: k8sClient.Scheme(), + Recorder: &noopRecorder{}, + } +} + +// reconcile invokes the reconciler once for the given CRD and returns the +// result and error directly — tests assert on both. +func reconcile(r *controllers.StorageVersionMigratorReconciler, crdName string) (ctrl.Result, error) { + return r.Reconcile(ctx, ctrl.Request{NamespacedName: types.NamespacedName{Name: crdName}}) +} + +var crdCounter int + +func uniqueSuffix() string { + crdCounter++ + return fmt.Sprintf("t%d", crdCounter) +} + +// ------------------------------------------------------------------ +// Tests +// ------------------------------------------------------------------ + +var _ = Describe("StorageVersionMigrator", func() { + Describe("Reconcile", func() { + + It("is a noop when storedVersions is already [storageVersion] and only one version is served", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "noops" + suf + "." + toolhiveGroup, + Group: toolhiveGroup, + Kind: "Noop" + suf, + ListKind: "Noop" + suf + "List", + Plural: "noops" + suf, + Singular: "noop" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + // envtest leaves storedVersions empty until a write happens. + // Seed it explicitly so the isMigrationNeeded check sees the + // "clean" state we want to exercise. + setStoredVersions(spec.Name, []string{"v1beta1"}) + + _, err := reconcile(newReconciler(), spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1beta1"})) + }) + + It("migrates storedVersions from [v1alpha1,v1beta1] to [v1beta1]", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "happies" + suf + "." + toolhiveGroup, + Group: toolhiveGroup, + Kind: "Happy" + suf, + ListKind: "Happy" + suf + "List", + Plural: "happies" + suf, + Singular: "happy" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + // The CRs created here are already stored at v1beta1 (the + // current storage version), so the apiserver will elide each + // per-CR Update — its freshly-encoded body matches etcd + // byte-for-byte. That's correct apiserver behaviour, not a + // controller bug, so this test does NOT assert per-CR + // resourceVersion bumps. The cross-version test below proves + // the migration mechanism actually re-encodes objects that were + // stored at an older apiVersion. + createCRs( + schema.GroupVersionKind{Group: spec.Group, Version: "v1beta1", Kind: spec.Kind}, + "obj-"+suf, 3, + ) + + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + _, err := reconcile(newReconciler(), spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1beta1"})) + }) + + // Load-bearing proof of the migration mechanism: a CR stored at + // v1alpha1, after the storage version has flipped to v1beta1, must + // have its resourceVersion bumped by reconcile — that's the + // observable evidence the apiserver actually re-encoded the etcd + // document. See the upstream confirmation at + // https://github.com/kubernetes-sigs/kube-storage-version-migrator/issues/65. + It("re-encodes CRs that are stored at a prior storage version", func() { + suf := uniqueSuffix() + crdName := "crossvers" + suf + "." + toolhiveGroup + kind := "CrossVer" + suf + plural := "crossvers" + suf + + versionSchema := func() *apiextensionsv1.CustomResourceValidation { + return &apiextensionsv1.CustomResourceValidation{ + OpenAPIV3Schema: &apiextensionsv1.JSONSchemaProps{ + Type: "object", + Properties: map[string]apiextensionsv1.JSONSchemaProps{ + "spec": {Type: "object", XPreserveUnknownFields: ptrBool(true)}, + "status": {Type: "object", XPreserveUnknownFields: ptrBool(true)}, + }, + }, + } + } + + // Step 1: install CRD with v1alpha1 as the storage version so + // CRs created next are written to etcd as v1alpha1 bytes. + crd := &apiextensionsv1.CustomResourceDefinition{ + ObjectMeta: metav1.ObjectMeta{ + Name: crdName, + Labels: map[string]string{migrateLabel: migrateValue}, + }, + Spec: apiextensionsv1.CustomResourceDefinitionSpec{ + Group: toolhiveGroup, + Names: apiextensionsv1.CustomResourceDefinitionNames{ + Kind: kind, + ListKind: kind + "List", + Plural: plural, + Singular: "crossver" + suf, + }, + Scope: apiextensionsv1.NamespaceScoped, + Versions: []apiextensionsv1.CustomResourceDefinitionVersion{ + {Name: "v1alpha1", Served: true, Storage: true, Schema: versionSchema()}, + {Name: "v1beta1", Served: true, Storage: false, Schema: versionSchema()}, + }, + }, + } + Expect(k8sClient.Create(ctx, crd)).To(Succeed()) + DeferCleanup(func() { deleteCRD(crdName) }) + + Eventually(func() bool { + live := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: crdName}, live); err != nil { + return false + } + for _, c := range live.Status.Conditions { + if c.Type == apiextensionsv1.Established && c.Status == apiextensionsv1.ConditionTrue { + return true + } + } + return false + }, time.Second*10, time.Millisecond*200).Should(BeTrue()) + + // Step 2: create one CR — etcd writes apiVersion: v1alpha1 bytes. + cr := createCRs( + schema.GroupVersionKind{Group: toolhiveGroup, Version: "v1alpha1", Kind: kind}, + "obj-"+suf, 1, + )[0] + + // Step 3: flip storage to v1beta1. + Eventually(func() error { + live := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: crdName}, live); err != nil { + return err + } + orig := live.DeepCopy() + for i := range live.Spec.Versions { + live.Spec.Versions[i].Storage = (live.Spec.Versions[i].Name == "v1beta1") + } + return k8sClient.Patch(ctx, live, client.MergeFrom(orig)) + }, time.Second*10, time.Millisecond*200).Should(Succeed()) + + // Confirm the storage flip settled before proceeding. + Eventually(func() bool { + live := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: crdName}, live); err != nil { + return false + } + for _, v := range live.Spec.Versions { + if v.Name == "v1beta1" && v.Storage { + return true + } + } + return false + }, time.Second*10, time.Millisecond*200).Should(BeTrue()) + + // Step 4: storedVersions reflects the historical v1alpha1 entry. + setStoredVersions(crdName, []string{"v1alpha1", "v1beta1"}) + + // Step 5: snapshot RV before reconcile. + preLive := &unstructured.Unstructured{} + preLive.SetGroupVersionKind(schema.GroupVersionKind{ + Group: toolhiveGroup, Version: "v1beta1", Kind: kind, + }) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cr), preLive)).To(Succeed()) + preRV := preLive.GetResourceVersion() + + // Step 6: reconcile. + _, err := reconcile(newReconciler(), crdName) + Expect(err).NotTo(HaveOccurred()) + + // Step 7: storedVersions trimmed. + Expect(getStoredVersions(crdName)).To(Equal([]string{"v1beta1"})) + + // Step 8: empirical proof — RV bumped because the cross-version + // Update actually wrote etcd. + postLive := &unstructured.Unstructured{} + postLive.SetGroupVersionKind(preLive.GroupVersionKind()) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cr), postLive)).To(Succeed()) + Expect(postLive.GetResourceVersion()).NotTo(Equal(preRV), + "CR %s/%s resourceVersion did not bump (pre=%s post=%s) — cross-version re-store did not write to etcd", + cr.GetNamespace(), cr.GetName(), preRV, postLive.GetResourceVersion()) + }) + + It("skips CRDs in foreign API groups", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "outsiders" + suf + ".example.com", + Group: "example.com", + Kind: "Outsider" + suf, + ListKind: "Outsider" + suf + "List", + Plural: "outsiders" + suf, + Singular: "outsider" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + _, err := reconcile(newReconciler(), spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1alpha1", "v1beta1"}), + "storedVersions must be untouched for foreign-group CRDs") + }) + + It("skips toolhive CRDs missing the opt-in label", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "unlabelled" + suf + "." + toolhiveGroup, + Group: toolhiveGroup, + Kind: "Unlabelled" + suf, + ListKind: "Unlabelled" + suf + "List", + Plural: "unlabelled" + suf, + Singular: "unlabelled" + suf, + Labelled: false, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + _, err := reconcile(newReconciler(), spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1alpha1", "v1beta1"}), + "storedVersions must be untouched for unlabelled CRDs") + }) + + It("handles pagination across multiple list pages", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "paginated" + suf + "." + toolhiveGroup, + Group: toolhiveGroup, + Kind: "Paginated" + suf, + ListKind: "Paginated" + suf + "List", + Plural: "paginated" + suf, + Singular: "paginated" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + // Seven CRs with PageSize=3 forces three pages (3+3+1) and + // exercises the continue-token loop far more cheaply than 501 + // writes against envtest. + createCRs( + schema.GroupVersionKind{Group: spec.Group, Version: "v1beta1", Kind: spec.Kind}, + "obj-"+suf, 7, + ) + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + // Wrap APIReader to count List calls for this kind. This is the + // only direct proof that the continue-token loop actually ran — + // metadata-only SSAs don't leave a managedFields fingerprint. + counting := &countingAPIReader{Reader: k8sClient, kind: spec.Kind} + r := &controllers.StorageVersionMigratorReconciler{ + Client: k8sClient, + APIReader: counting, + Scheme: k8sClient.Scheme(), + Recorder: &noopRecorder{}, + PageSize: 3, + } + _, err := reconcile(r, spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1beta1"})) + + // 7 CRs with PageSize=3 ⇒ 3 list calls (pages of 3+3+1). + Expect(counting.listCalls).To(BeNumerically(">=", 3), + "pagination should have triggered at least 3 list calls for 7 CRs at pageSize=3; got %d", + counting.listCalls) + }) + + It("does not touch storedVersions when a CR re-store fails", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "failures" + suf + "." + toolhiveGroup, + Group: toolhiveGroup, + Kind: "Failure" + suf, + ListKind: "Failure" + suf + "List", + Plural: "failures" + suf, + Singular: "failure" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + crs := createCRs( + schema.GroupVersionKind{Group: spec.Group, Version: "v1beta1", Kind: spec.Kind}, + "obj-"+suf, 3, + ) + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + failureTarget := client.ObjectKeyFromObject(crs[0]) + failing := &failingUpdateClient{ + Client: k8sClient, + errFn: func(key client.ObjectKey) error { + if key == failureTarget { + return fmt.Errorf("injected update failure for %s", key) + } + return nil + }, + } + r := &controllers.StorageVersionMigratorReconciler{ + Client: failing, + APIReader: k8sClient, + Scheme: k8sClient.Scheme(), + Recorder: &noopRecorder{}, + } + _, err := reconcile(r, spec.Name) + Expect(err).To(HaveOccurred(), "reconcile should surface the injected failure") + Expect(err.Error()).To(ContainSubstring(failureTarget.Name)) + + // Critical contract: storedVersions must NOT be trimmed when any + // CR re-store failed. Otherwise the next release's v1alpha1 + // removal would orphan the un-migrated object in etcd. + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1alpha1", "v1beta1"})) + }) + + It("leaves storedVersions untouched when a CR re-store hits a Conflict, then trims on retry", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "conflicts" + suf + "." + toolhiveGroup, + Group: toolhiveGroup, + Kind: "Conflict" + suf, + ListKind: "Conflict" + suf + "List", + Plural: "conflicts" + suf, + Singular: "conflict" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + crs := createCRs( + schema.GroupVersionKind{Group: spec.Group, Version: "v1beta1", Kind: spec.Kind}, + "obj-"+suf, 2, + ) + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + conflictTarget := client.ObjectKeyFromObject(crs[0]) + injectConflict := true + gr := schema.GroupResource{Group: spec.Group, Resource: spec.Plural} + conflicting := &failingUpdateClient{ + Client: k8sClient, + errFn: func(key client.ObjectKey) error { + if injectConflict && key == conflictTarget { + return apierrors.NewConflict(gr, key.Name, + fmt.Errorf("injected conflict")) + } + return nil + }, + } + r := &controllers.StorageVersionMigratorReconciler{ + Client: conflicting, + APIReader: k8sClient, + Scheme: k8sClient.Scheme(), + Recorder: &noopRecorder{}, + } + + // First pass: conflict swallowed at the per-CR level, but the + // function-level conflict counter trips errMigrationRetriedDueToConflicts + // so storedVersions is left untouched. + _, err := reconcile(r, spec.Name) + Expect(err).To(HaveOccurred(), + "reconcile must return an error when a Conflict was swallowed") + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1alpha1", "v1beta1"}), + "storedVersions must not be trimmed on a pass with any swallowed Conflict") + + // Drop the injection and retry. The cache may have absorbed the + // non-conflicting CR's RV from the first pass — that's fine, the + // conflicting one was never recorded in the cache so it'll be + // re-attempted, succeed, and let the storedVersions patch fire. + injectConflict = false + _, err = reconcile(r, spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1beta1"})) + }) + }) +}) + +// ------------------------------------------------------------------ +// Test doubles +// ------------------------------------------------------------------ + +// failingUpdateClient wraps a real client.Client and intercepts Update (and +// Status().Update) for specific object keys. The controller's restoreOne goes +// through Update — so this wrapper is how we inject failures and conflicts. +// +// errFn returns the error to inject for a given key, or nil to let the call +// pass through to the wrapped client. Returning a non-nil error short-circuits +// the call so the underlying object is not modified. +type failingUpdateClient struct { + client.Client + errFn func(key client.ObjectKey) error +} + +func (f *failingUpdateClient) Update(ctx context.Context, obj client.Object, opts ...client.UpdateOption) error { + if err := f.errFn(client.ObjectKeyFromObject(obj)); err != nil { + return err + } + return f.Client.Update(ctx, obj, opts...) +} + +func (f *failingUpdateClient) Status() client.SubResourceWriter { + return &failingUpdateStatus{ + inner: f.Client.Status(), + errFn: f.errFn, + } +} + +type failingUpdateStatus struct { + inner client.SubResourceWriter + errFn func(key client.ObjectKey) error +} + +func (s *failingUpdateStatus) Create(ctx context.Context, obj client.Object, sub client.Object, opts ...client.SubResourceCreateOption) error { + return s.inner.Create(ctx, obj, sub, opts...) +} + +func (s *failingUpdateStatus) Update(ctx context.Context, obj client.Object, opts ...client.SubResourceUpdateOption) error { + if err := s.errFn(client.ObjectKeyFromObject(obj)); err != nil { + return err + } + return s.inner.Update(ctx, obj, opts...) +} + +func (s *failingUpdateStatus) Patch(ctx context.Context, obj client.Object, patch client.Patch, opts ...client.SubResourcePatchOption) error { + return s.inner.Patch(ctx, obj, patch, opts...) +} + +func (s *failingUpdateStatus) Apply(ctx context.Context, obj runtime.ApplyConfiguration, opts ...client.SubResourceApplyOption) error { + return s.inner.Apply(ctx, obj, opts...) +} + +// noopRecorder is a minimal EventRecorder for direct-Reconcile tests. +type noopRecorder struct{} + +func (*noopRecorder) Event(_ runtime.Object, _, _, _ string) {} +func (*noopRecorder) Eventf(_ runtime.Object, _, _, _ string, _ ...any) {} +func (*noopRecorder) AnnotatedEventf(_ runtime.Object, _ map[string]string, _, _, _ string, _ ...any) { +} + +// countingAPIReader wraps a client.Reader and records how many List calls +// targeted a given kind. Used by the pagination test to verify the +// continue-token loop ran as expected. +type countingAPIReader struct { + client.Reader + kind string + listCalls int +} + +func (c *countingAPIReader) List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error { + if u, ok := list.(*unstructured.UnstructuredList); ok { + // ListKind is "List"; match on the configured kind prefix. + if u.GetKind() == c.kind+"List" { + c.listCalls++ + } + } + return c.Reader.List(ctx, list, opts...) +} diff --git a/cmd/thv-operator/test-integration/storageversionmigrator/suite_test.go b/cmd/thv-operator/test-integration/storageversionmigrator/suite_test.go new file mode 100644 index 0000000000..0e94586912 --- /dev/null +++ b/cmd/thv-operator/test-integration/storageversionmigrator/suite_test.go @@ -0,0 +1,83 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +// Package storageversionmigrator contains envtest-backed integration tests +// for the StorageVersionMigrator controller. +// +// The suite does NOT pre-install any CRDs — each test constructs and installs +// the exact CRD it needs, which keeps scenarios independent and lets us +// exercise edge cases (foreign groups, missing status subresource, etc.) that +// wouldn't be possible with the real toolhive CRD manifests. +// +// The suite also does NOT start a controller manager. Each test constructs its +// own reconciler and calls Reconcile directly. This is more deterministic than +// manager-driven tests (no Eventually() races against the background +// controller) and lets individual tests inject custom clients to exercise +// failure paths. +package storageversionmigrator + +import ( + "context" + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "go.uber.org/zap/zapcore" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" +) + +var ( + cfg *rest.Config + k8sClient client.Client + testEnv *envtest.Environment + ctx context.Context + cancel context.CancelFunc +) + +func TestStorageVersionMigrator(t *testing.T) { + t.Parallel() + RegisterFailHandler(Fail) + + suiteConfig, reporterConfig := GinkgoConfiguration() + reporterConfig.Verbose = false + reporterConfig.VeryVerbose = false + reporterConfig.FullTrace = false + + RunSpecs(t, "StorageVersionMigrator Controller Integration Test Suite", suiteConfig, reporterConfig) +} + +var _ = BeforeSuite(func() { + logLevel := zapcore.ErrorLevel + logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true), zap.Level(logLevel))) + + ctx, cancel = context.WithCancel(context.TODO()) + + By("bootstrapping envtest") + testEnv = &envtest.Environment{ + ErrorIfCRDPathMissing: false, // tests install CRDs on demand + } + + var err error + cfg, err = testEnv.Start() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg).NotTo(BeNil()) + + utilruntime.Must(apiextensionsv1.AddToScheme(scheme.Scheme)) + + k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient).NotTo(BeNil()) +}) + +var _ = AfterSuite(func() { + By("tearing down envtest") + cancel() + Expect(testEnv.Stop()).NotTo(HaveOccurred()) +}) diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_embeddingservers.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_embeddingservers.yaml index ab99253a70..cf1790b3a5 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_embeddingservers.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_embeddingservers.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: embeddingservers.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml index 52dcc4b8cd..6132d263c6 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpexternalauthconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpgroups.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpgroups.yaml index aa05cf8c17..8b6b3ef56a 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpgroups.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpgroups.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpgroups.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpoidcconfigs.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpoidcconfigs.yaml index 2c91fab5e5..907863642e 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpoidcconfigs.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpoidcconfigs.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpoidcconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpregistries.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpregistries.yaml index 190fcf6daf..03a55b7bb0 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpregistries.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpregistries.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpregistries.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml index 21e6ee87a7..9a6bbb3fba 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpremoteproxies.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpserverentries.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpserverentries.yaml index 78c2523eca..c7a9896551 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpserverentries.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpserverentries.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpserverentries.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml index 04179042e7..e598a019dc 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpservers.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptelemetryconfigs.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptelemetryconfigs.yaml index 9a61add0cb..2262801a88 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptelemetryconfigs.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptelemetryconfigs.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcptelemetryconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptoolconfigs.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptoolconfigs.yaml index c9f2732d24..bdcfc3a3ca 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptoolconfigs.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptoolconfigs.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcptoolconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml index a9c17deff5..bd13d9ea55 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: virtualmcpcompositetooldefinitions.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml index 8a9a6e217a..2a5cd24a06 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: virtualmcpservers.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_embeddingservers.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_embeddingservers.yaml index 4e9df1d7ee..09b56976fa 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_embeddingservers.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_embeddingservers.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: embeddingservers.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml index 7242f0d237..9095c2ac19 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpexternalauthconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpgroups.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpgroups.yaml index 996575e4aa..9ace1332eb 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpgroups.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpgroups.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpgroups.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpoidcconfigs.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpoidcconfigs.yaml index 3c23c16b23..c38d20e046 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpoidcconfigs.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpoidcconfigs.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpoidcconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpregistries.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpregistries.yaml index 9deb807fb4..562e81320c 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpregistries.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpregistries.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpregistries.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml index b029787947..bca9545db6 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpremoteproxies.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpserverentries.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpserverentries.yaml index 6542d398ab..ec29903223 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpserverentries.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpserverentries.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpserverentries.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml index 573c6e83bf..ad733a56af 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpservers.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptelemetryconfigs.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptelemetryconfigs.yaml index ac6ef5767b..3377fe0408 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptelemetryconfigs.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptelemetryconfigs.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcptelemetryconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptoolconfigs.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptoolconfigs.yaml index c7f5fbf2d3..d0b232ae59 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptoolconfigs.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptoolconfigs.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcptoolconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml index 39fcdea5fc..d90839c8fa 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: virtualmcpcompositetooldefinitions.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml index 232121fb53..fcf41a4b6f 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: virtualmcpservers.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator/README.md b/deploy/charts/operator/README.md index 3727d3bba2..dea310e5e5 100644 --- a/deploy/charts/operator/README.md +++ b/deploy/charts/operator/README.md @@ -46,7 +46,7 @@ The command removes all the Kubernetes components associated with the chart and |-----|------|---------|-------------| | fullnameOverride | string | `"toolhive-operator"` | Provide a fully-qualified name override for resources | | nameOverride | string | `""` | Override the name of the chart | -| operator | object | `{"affinity":{},"autoscaling":{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80},"containerSecurityContext":{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"readOnlyRootFilesystem":true,"runAsNonRoot":true,"runAsUser":1000,"seccompProfile":{"type":"RuntimeDefault"}},"defaultImagePullSecrets":[],"env":[],"features":{"experimental":false},"gc":{"gogc":75,"gomemlimit":"110MiB"},"image":"ghcr.io/stacklok/toolhive/operator:v0.28.1","imagePullPolicy":"IfNotPresent","imagePullSecrets":[],"leaderElectionRole":{"binding":{"name":"toolhive-operator-leader-election-rolebinding"},"name":"toolhive-operator-leader-election-role","rules":[{"apiGroups":[""],"resources":["configmaps"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["coordination.k8s.io"],"resources":["leases"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["events.k8s.io"],"resources":["events"],"verbs":["create","patch"]}]},"livenessProbe":{"httpGet":{"path":"/healthz","port":"health"},"initialDelaySeconds":15,"periodSeconds":20},"nodeSelector":{},"podAnnotations":{},"podLabels":{},"podSecurityContext":{"runAsNonRoot":true},"ports":[{"containerPort":8080,"name":"metrics","protocol":"TCP"},{"containerPort":8081,"name":"health","protocol":"TCP"}],"proxyHost":"0.0.0.0","rbac":{"allowedNamespaces":[],"scope":"cluster"},"readinessProbe":{"httpGet":{"path":"/readyz","port":"health"},"initialDelaySeconds":5,"periodSeconds":10},"replicaCount":1,"resources":{"limits":{"cpu":"500m","memory":"128Mi"},"requests":{"cpu":"10m","memory":"64Mi"}},"serviceAccount":{"annotations":{},"automountServiceAccountToken":true,"create":true,"labels":{},"name":"toolhive-operator"},"tolerations":[],"toolhiveRunnerImage":"ghcr.io/stacklok/toolhive/proxyrunner:v0.28.1","vmcpImage":"ghcr.io/stacklok/toolhive/vmcp:v0.28.1","volumeMounts":[],"volumes":[]}` | All values for the operator deployment and associated resources | +| operator | object | `{"affinity":{},"autoscaling":{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80},"containerSecurityContext":{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"readOnlyRootFilesystem":true,"runAsNonRoot":true,"runAsUser":1000,"seccompProfile":{"type":"RuntimeDefault"}},"defaultImagePullSecrets":[],"env":[],"features":{"experimental":false,"storageVersionMigrator":true},"gc":{"gogc":75,"gomemlimit":"110MiB"},"image":"ghcr.io/stacklok/toolhive/operator:v0.28.1","imagePullPolicy":"IfNotPresent","imagePullSecrets":[],"leaderElectionRole":{"binding":{"name":"toolhive-operator-leader-election-rolebinding"},"name":"toolhive-operator-leader-election-role","rules":[{"apiGroups":[""],"resources":["configmaps"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["coordination.k8s.io"],"resources":["leases"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["events.k8s.io"],"resources":["events"],"verbs":["create","patch"]}]},"livenessProbe":{"httpGet":{"path":"/healthz","port":"health"},"initialDelaySeconds":15,"periodSeconds":20},"nodeSelector":{},"podAnnotations":{},"podLabels":{},"podSecurityContext":{"runAsNonRoot":true},"ports":[{"containerPort":8080,"name":"metrics","protocol":"TCP"},{"containerPort":8081,"name":"health","protocol":"TCP"}],"proxyHost":"0.0.0.0","rbac":{"allowedNamespaces":[],"scope":"cluster"},"readinessProbe":{"httpGet":{"path":"/readyz","port":"health"},"initialDelaySeconds":5,"periodSeconds":10},"replicaCount":1,"resources":{"limits":{"cpu":"500m","memory":"128Mi"},"requests":{"cpu":"10m","memory":"64Mi"}},"serviceAccount":{"annotations":{},"automountServiceAccountToken":true,"create":true,"labels":{},"name":"toolhive-operator"},"tolerations":[],"toolhiveRunnerImage":"ghcr.io/stacklok/toolhive/proxyrunner:v0.28.1","vmcpImage":"ghcr.io/stacklok/toolhive/vmcp:v0.28.1","volumeMounts":[],"volumes":[]}` | All values for the operator deployment and associated resources | | operator.affinity | object | `{}` | Affinity settings for the operator pod | | operator.autoscaling | object | `{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}` | Configuration for horizontal pod autoscaling | | operator.autoscaling.enabled | bool | `false` | Enable autoscaling for the operator | @@ -57,6 +57,7 @@ The command removes all the Kubernetes components associated with the chart and | operator.defaultImagePullSecrets | list | `[]` | List of image pull secrets that the operator applies as defaults to every workload it spawns (proxy runners, vMCP servers, registry API, etc.). Per-CR `imagePullSecrets` take precedence on name collisions; chart-level entries are appended additively. The operator parses these once at startup from the TOOLHIVE_DEFAULT_IMAGE_PULL_SECRETS environment variable. The Secrets must exist in the namespace where each workload is created. Each entry may be either a plain string (the Secret name) or an object with a `name` field, e.g.: defaultImagePullSecrets: - regcred - name: otherscred The two shapes are equivalent; the object form matches `operator.imagePullSecrets` above for convenience. | | operator.env | list | `[]` | Environment variables to set in the operator container. Supported toolhive-specific variables include: - TOOLHIVE_SKIP_UPDATE_CHECK: set to "true" to disable the operator's periodic update check against the ToolHive update API. Also disables the usage-metrics collection that is gated on the same check. | | operator.features.experimental | bool | `false` | Enable experimental features | +| operator.features.storageVersionMigrator | bool | `true` | Enable the StorageVersionMigrator controller, which auto-cleans status.storedVersions on opted-in toolhive.stacklok.dev CRDs so a future release can drop deprecated versions (e.g. v1alpha1) without orphaning etcd objects. Leave this on unless you are running kube-storage-version-migrator externally. This automatically sets ENABLE_STORAGE_VERSION_MIGRATOR environment variable. | | operator.gc | object | `{"gogc":75,"gomemlimit":"110MiB"}` | Go memory limits and garbage collection percentage for the operator container | | operator.gc.gogc | int | `75` | Go garbage collection percentage for the operator container | | operator.gc.gomemlimit | string | `"110MiB"` | Go memory limits for the operator container | diff --git a/deploy/charts/operator/templates/clusterrole/role.yaml b/deploy/charts/operator/templates/clusterrole/role.yaml index a7a8b3f6e4..392730b4a5 100644 --- a/deploy/charts/operator/templates/clusterrole/role.yaml +++ b/deploy/charts/operator/templates/clusterrole/role.yaml @@ -41,6 +41,21 @@ rules: - pods/log verbs: - get +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions/status + verbs: + - patch + - update - apiGroups: - apps resources: @@ -98,27 +113,15 @@ rules: - apiGroups: - toolhive.stacklok.dev resources: - - embeddingservers - - mcpexternalauthconfigs - - mcpgroups - - mcpoidcconfigs - - mcpregistries - - mcpremoteproxies - - mcpservers - - mcptoolconfigs - - mcpwebhookconfigs - - virtualmcpservers + - '*' verbs: - - create - - delete - get - list - - patch - update - - watch - apiGroups: - toolhive.stacklok.dev resources: + - '*/status' - embeddingservers/finalizers - mcpexternalauthconfigs/finalizers - mcpgroups/finalizers @@ -130,6 +133,27 @@ rules: - mcpwebhookconfigs/finalizers verbs: - update +- apiGroups: + - toolhive.stacklok.dev + resources: + - embeddingservers + - mcpexternalauthconfigs + - mcpgroups + - mcpoidcconfigs + - mcpregistries + - mcpremoteproxies + - mcpservers + - mcptoolconfigs + - mcpwebhookconfigs + - virtualmcpservers + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - toolhive.stacklok.dev resources: diff --git a/deploy/charts/operator/templates/deployment.yaml b/deploy/charts/operator/templates/deployment.yaml index fa81ad160b..10c69c64e3 100644 --- a/deploy/charts/operator/templates/deployment.yaml +++ b/deploy/charts/operator/templates/deployment.yaml @@ -68,6 +68,8 @@ spec: value: "true" - name: ENABLE_EXPERIMENTAL_FEATURES value: {{ .Values.operator.features.experimental | quote }} + - name: ENABLE_STORAGE_VERSION_MIGRATOR + value: {{ .Values.operator.features.storageVersionMigrator | quote }} {{- if eq .Values.operator.rbac.scope "namespace" }} - name: WATCH_NAMESPACE value: "{{ .Values.operator.rbac.allowedNamespaces | join "," }}" diff --git a/deploy/charts/operator/values.yaml b/deploy/charts/operator/values.yaml index 423e56ef36..40ba884853 100644 --- a/deploy/charts/operator/values.yaml +++ b/deploy/charts/operator/values.yaml @@ -8,6 +8,13 @@ operator: features: # -- Enable experimental features experimental: false + # -- Enable the StorageVersionMigrator controller, which auto-cleans + # status.storedVersions on opted-in toolhive.stacklok.dev CRDs so a + # future release can drop deprecated versions (e.g. v1alpha1) without + # orphaning etcd objects. Leave this on unless you are running + # kube-storage-version-migrator externally. + # This automatically sets ENABLE_STORAGE_VERSION_MIGRATOR environment variable. + storageVersionMigrator: true # -- Number of replicas for the operator deployment replicaCount: 1 diff --git a/docs/operator/storage-version-migration.md b/docs/operator/storage-version-migration.md new file mode 100644 index 0000000000..828ddc1831 --- /dev/null +++ b/docs/operator/storage-version-migration.md @@ -0,0 +1,116 @@ +# Storage Version Migration + +The ToolHive operator ships a `StorageVersionMigrator` controller that keeps every ToolHive CRD's `status.storedVersions` list clean, so a future operator release can drop deprecated API versions (e.g. `v1alpha1`) without orphaning objects in etcd. + +## Why this exists + +When a CRD graduates from, say, `v1alpha1` to `v1beta1` with both versions served and `v1beta1` as the storage version, existing objects continue to work — they are transparently converted on read/write. But the API server records every version that has ever been used for storage in `CustomResourceDefinition.status.storedVersions`. Until that list is trimmed, the Kubernetes API server refuses to let you remove a version from `spec.versions`, because doing so would orphan any etcd-stored objects encoded at that version. + +The cleanup is not automatic. Someone has to re-store every existing object at the current storage version, then explicitly patch `status.storedVersions` to drop the old entry. The `StorageVersionMigrator` controller does this for you, on every opted-in ToolHive CRD, continuously. See [upstream Kubernetes documentation](https://kubernetes.io/docs/tasks/manage-kubernetes-objects/storage-version-migration/) for the mechanism. + +## What the controller does + +For each opted-in CRD: + +1. Reads `spec.versions` to find the entry with `storage: true`. +2. If `status.storedVersions` already equals `[]` and only one version is served, nothing to do. +3. Otherwise, lists every Custom Resource of that kind and, for each one, does a plain `Get` followed by an `Update` of the unmodified live object. The apiserver re-encodes the request body at the current storage version and compares it to what is in etcd. When the CR was originally stored at a different `apiVersion` (the actual migration scenario) the bytes carry a different stamp than etcd's record, the comparison fails, and the write proceeds — re-encoding the object at the current storage version. When the CR is already at the current storage version, the bytes match and the apiserver harmlessly elides the write — there was nothing to migrate. This matches the upstream `kube-storage-version-migrator`'s mechanism, confirmed at [kubernetes-sigs/kube-storage-version-migrator#65](https://github.com/kubernetes-sigs/kube-storage-version-migrator/issues/65). +4. Once every object has been processed, patches `CRD.status.storedVersions` to `[]` using an optimistic-lock merge — so concurrent API-server writes cause a clean retry rather than a silent overwrite. + +### Webhook interaction + +The migrator's writes go through the **main resource** Update path, so any validating or mutating admission webhooks registered on the kind see each request as part of normal admission flow. However, only requests that the apiserver actually persists produce downstream state changes a webhook would meaningfully react to — webhook load is bounded by "objects actually being migrated" (CRs whose etcd bytes carry an older `apiVersion` stamp), not "every reconcile pass × every CR". + +## The opt-in label + +A CRD participates in migration only if it carries: + +```yaml +metadata: + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" +``` + +The label is set at CRD-generation time via a kubebuilder marker on each Go type in `cmd/thv-operator/api/v1beta1/`: + +```go +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true +type MCPServer struct { ... } +``` + +`task operator-manifests` bakes the label into the generated CRD YAML. All current ToolHive root types ship with the marker. A CI test (`TestV1beta1TypesMarkerCoverage`) fails the build if a root type is added without either this marker or an explicit `// +thv:storage-version-migrator:exclude` sibling marker — so the migrator cannot silently forget a new CRD. + +Adding a new CRD that should be migrated: + +```go +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true +type NewShinyThing struct { ... } +``` + +Adding a new CRD that deliberately should NOT be migrated (e.g. an experimental kind that is still stabilising its schema): + +```go +// +thv:storage-version-migrator:exclude +type ExperimentalThing struct { ... } +``` + +## Disabling the controller + +Set the Helm feature flag: + +```yaml +operator: + features: + storageVersionMigrator: false # default: true +``` + +This sets `ENABLE_STORAGE_VERSION_MIGRATOR=false` on the operator Deployment, and the reconciler is not registered with the manager. + +Disable only if you are running an external migrator such as [kube-storage-version-migrator](https://github.com/kubernetes-sigs/kube-storage-version-migrator). Disabling without a replacement is a footgun: the next ToolHive release that removes a deprecated API version will refuse to apply its CRD update until `storedVersions` is cleaned, and you will have to clean it yourself. + +## Per-CRD emergency escape hatch + +Removing the label on a live cluster excludes that single CRD from migration immediately: + +```bash +kubectl label crd/mcpservers.toolhive.stacklok.dev \ + toolhive.stacklok.dev/auto-migrate-storage-version- +``` + +Intended for incident response only. If you deploy the operator via GitOps (Argo CD, Flux) or `helm upgrade`, the chart will re-apply the chart-set label within seconds. Use the `storageVersionMigrator` feature flag for long-term opt-out. + +## Interaction with version removal releases + +The `StorageVersionMigrator` must have had time to run against your cluster *before* an operator release that drops a deprecated CRD version ships. The typical sequence is: + +1. **Release N**: both versions served, newer version is storage, `StorageVersionMigrator` enabled. The controller quietly re-stores all objects and trims `storedVersions` on every cluster during this deprecation window. +2. **Release N+1+**: the deprecated version is removed from `spec.versions`. Because every cluster's `storedVersions` was already cleaned in the previous release, the CRD update applies cleanly. + +If your cluster upgraded directly from a pre-migrator release to the version-removal release without ever running release N, you must clean `storedVersions` manually (or deploy `kube-storage-version-migrator` once) before the upgrade can succeed. + +## Verification + +For any ToolHive CRD in a cluster where the controller has run: + +```bash +kubectl get crd mcpservers.toolhive.stacklok.dev \ + -o jsonpath='{.status.storedVersions}' +# ["v1beta1"] +``` + +If the list contains more than one entry, the controller has not yet finished migrating — check operator logs for reconcile errors and the `StorageVersionMigrationFailed` event on the CRD. + +## RBAC + +The controller requires (generated from kubebuilder markers, applied by the operator Helm chart): + +- `customresourcedefinitions.apiextensions.k8s.io`: `get`, `list`, `watch` +- `customresourcedefinitions/status.apiextensions.k8s.io`: `update`, `patch` +- `*.toolhive.stacklok.dev`: `get`, `list`, `update` +- `*/status.toolhive.stacklok.dev`: `update` + +## Related + +- Issue: [stacklok/toolhive#4969](https://github.com/stacklok/toolhive/issues/4969) +- Kubernetes CRD versioning: [official docs](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definition-versioning/) +- Reference implementation: [kubernetes-sigs/kube-storage-version-migrator](https://github.com/kubernetes-sigs/kube-storage-version-migrator) (the upstream SIG API Machinery tool — our controller uses the same Get+Update mechanism, scoped to ToolHive CRDs and triggered automatically rather than via per-migration `StorageVersionMigration` resources) diff --git a/docs/operator/upgrade-guide/README.md b/docs/operator/upgrade-guide/README.md new file mode 100644 index 0000000000..845a57bc2e --- /dev/null +++ b/docs/operator/upgrade-guide/README.md @@ -0,0 +1,323 @@ +# Upgrade walkthrough — v1alpha1 → v1beta1 with StorageVersionMigrator + +End-to-end manual walkthrough that replays a real user upgrade against a local kind cluster: install the previous v0.21.0 release, create a CR of each of the 12 toolhive CRD kinds at `v1alpha1`, upgrade to the new multi-version chart, deploy this branch's operator with the migrator **disabled**, re-apply the CRs at `v1beta1`, and confirm `status.storedVersions` is stuck at `[v1alpha1, v1beta1]` on every CRD. Then enable the `StorageVersionMigrator` and confirm it converges every CRD to `[v1beta1]`. + +Total run time: ~30 minutes. The slow part is the first `ko build` of the operator + proxyrunner + vmcp images (~3 min); subsequent runs use the build cache and finish in ~30s. + +This guide is the canonical reproducible verification for the migrator. Companion reading: + +- [`docs/operator/storage-version-migration.md`](../storage-version-migration.md) — reference docs for the controller itself (label contract, opt-out, mechanism). +- [Issue #4969](https://github.com/stacklok/toolhive/issues/4969) — the motivating problem. + +## Prerequisites + +- `kind`, `kubectl`, `helm`, `ko`, `task` on PATH (`go install github.com/google/ko@latest` for ko) +- Working directory: the repo root (`task operator-deploy-local` and the relative chart paths assume this). +- Cluster name is `toolhive`. If you already have a cluster with that name, delete it first or run from a different worktree. + +The CR fixtures used below ship alongside this doc: + +- [`crs-v1alpha1.yaml`](./crs-v1alpha1.yaml) — one CR of each of the 12 kinds at `v1alpha1` +- [`crs-v1beta1.yaml`](./crs-v1beta1.yaml) — byte-identical to the v1alpha1 file except for `apiVersion`, simulating what `sed -i 's/v1alpha1/v1beta1/g'` would produce + +--- + +## 0 · Set up the cluster + +```bash +# If you already have a "toolhive" kind cluster from a previous run, delete it +kind delete cluster --name toolhive 2>/dev/null + +kind create cluster --name toolhive --wait 60s +kind get kubeconfig --name toolhive > kconfig.yaml +export KUBECONFIG=$(pwd)/kconfig.yaml +``` + +## 1 · Install v0.21.0 (the last v1alpha1-only release) + +```bash +helm install toolhive-operator-crds \ + oci://ghcr.io/stacklok/toolhive/toolhive-operator-crds \ + --version 0.21.0 --wait + +helm install toolhive-operator \ + oci://ghcr.io/stacklok/toolhive/toolhive-operator \ + --version 0.21.0 \ + --namespace toolhive-system --create-namespace --wait + +kubectl get crd mcpservers.toolhive.stacklok.dev -o jsonpath='{.spec.versions[*].name}' +# expected: v1alpha1 (only one version) + +kubectl wait --for=condition=Available deployment -n toolhive-system --all --timeout=180s +``` + +## 2 · Create one CR of each of the 12 kinds at v1alpha1 + +```bash +kubectl create namespace upgrade-test +kubectl apply -f docs/operator/upgrade-guide/crs-v1alpha1.yaml + +# Confirm all 12 landed +kubectl get \ + mcpservers,mcpremoteproxies,mcptoolconfigs,mcpgroups,embeddingservers,mcpregistries,virtualmcpservers,virtualmcpcompositetooldefinitions,mcpoidcconfigs,mcptelemetryconfigs,mcpexternalauthconfigs,mcpserverentries \ + -n upgrade-test --no-headers | wc -l +# expected: 12 +``` + +## 3 · Let the old operator reconcile + capture the baseline + +```bash +sleep 60 +kubectl get deployments -n upgrade-test +# expected: up to 5 Deployments — test-server, test-remote-proxy, test-virtual-server, +# test-registry-api, and (sometimes) test-embedding shows as a StatefulSet not a Deployment + +# Snapshot the UIDs for later comparison +kubectl get deployments -n upgrade-test \ + -o jsonpath='{range .items[*]}{.metadata.name}={.metadata.uid}{"\n"}{end}' \ + | sort > /tmp/before-upgrade.txt +cat /tmp/before-upgrade.txt +``` + +These UIDs are the canary. If they change after the operator upgrade, a workload was recreated → downtime. + +## 4 · Upgrade the CRDs chart to multi-version + +```bash +helm upgrade toolhive-operator-crds deploy/charts/operator-crds --wait --timeout 120s + +kubectl get crd mcpservers.toolhive.stacklok.dev -o jsonpath='{.spec.versions[*].name}' +# expected: v1alpha1 v1beta1 + +kubectl get crd mcpservers.toolhive.stacklok.dev -o jsonpath='{.spec.versions[?(@.storage==true)].name}' +# expected: v1beta1 +``` + +## 5 · Build the new operator + deploy with the migrator DISABLED + +This is the key deviation from `task operator-deploy-local` — we want to control the feature flag, so we run ko + helm manually rather than using the task. + +```bash +# ~3 minutes on first run, ~30s with build cache +OP=$(KO_DOCKER_REPO=kind.local ko build --local -B ./cmd/thv-operator | tail -1) +PR=$(KO_DOCKER_REPO=kind.local ko build --local -B ./cmd/thv-proxyrunner | tail -1) +VM=$(KO_DOCKER_REPO=kind.local ko build --local -B ./cmd/vmcp | tail -1) + +# Load all three into the kind cluster +kind load docker-image --name toolhive "$OP" +kind load docker-image --name toolhive "$PR" +kind load docker-image --name toolhive "$VM" + +# Persist for later steps +echo "$OP" > /tmp/img-op +echo "$PR" > /tmp/img-pr +echo "$VM" > /tmp/img-vm + +# Helm upgrade with migrator EXPLICITLY DISABLED +helm upgrade toolhive-operator deploy/charts/operator \ + --set operator.image="$OP" \ + --set operator.toolhiveRunnerImage="$PR" \ + --set operator.vmcpImage="$VM" \ + --set operator.features.storageVersionMigrator=false \ + --namespace toolhive-system + +kubectl rollout status deployment -n toolhive-system --timeout=180s + +# Confirm flag took effect +NEW_POD=$(kubectl get pods -n toolhive-system --no-headers | grep "toolhive-operator-" | awk '{print $1}' | head -1) +kubectl get pod "$NEW_POD" -n toolhive-system -o yaml | grep -A1 ENABLE_STORAGE_VERSION_MIGRATOR +# expected: value: "false" + +kubectl logs "$NEW_POD" -n toolhive-system | grep "ENABLE_STORAGE_VERSION_MIGRATOR is disabled" +# expected: one line — "ENABLE_STORAGE_VERSION_MIGRATOR is disabled, skipping StorageVersionMigrator controller" +``` + +## 6 · Verify zero downtime — Deployment UIDs unchanged + +```bash +kubectl get deployments -n upgrade-test \ + -o jsonpath='{range .items[*]}{.metadata.name}={.metadata.uid}{"\n"}{end}' \ + | sort > /tmp/after-upgrade.txt + +diff /tmp/before-upgrade.txt /tmp/after-upgrade.txt && echo "All Deployment UIDs unchanged" +``` + +## 7 · Re-apply all 12 CRs at v1beta1 (the user migration step) + +```bash +kubectl apply -f docs/operator/upgrade-guide/crs-v1beta1.yaml +# expected: 12 "configured" lines (not "created") + +# Wait for the operator to observe each update +sleep 10 +``` + +## 8 · Confirm storedVersions is stuck at `[v1alpha1, v1beta1]` on all 12 CRDs (migrator is OFF) + +```bash +echo "=== storedVersions on ALL 12 CRDs (migrator OFF) ===" +for crd in $(kubectl get crd -o name | grep toolhive.stacklok.dev | sort); do + name=$(echo $crd | sed 's|.*/||') + stored=$(kubectl get $crd -o jsonpath='{.status.storedVersions}') + printf " %-55s %s\n" "$name" "$stored" +done +``` + +Expected: every line ends with `["v1alpha1","v1beta1"]`. This is the "post-graduation, pre-migration" state every cluster lands in after the v0.21.0 → multi-version upgrade. **Without the migrator, this state is permanent** — any future operator release that drops `v1alpha1` from `spec.versions` would fail with: + +``` +status.storedVersions[0]: Invalid value: "v1alpha1": must appear in spec.versions +``` + +## 9 · Enable the migrator + watch it converge + +Helm upgrade to flip the feature flag: + +```bash +OP=$(cat /tmp/img-op) +PR=$(cat /tmp/img-pr) +VM=$(cat /tmp/img-vm) + +helm upgrade toolhive-operator deploy/charts/operator \ + --set operator.image="$OP" \ + --set operator.toolhiveRunnerImage="$PR" \ + --set operator.vmcpImage="$VM" \ + --namespace toolhive-system + +kubectl rollout status deployment -n toolhive-system --timeout=180s + +# Confirm flag is now true +NEW_POD=$(kubectl get pods -n toolhive-system --no-headers | grep "toolhive-operator-" | awk '{print $1}' | head -1) +kubectl get pod "$NEW_POD" -n toolhive-system -o yaml | grep -A1 ENABLE_STORAGE_VERSION_MIGRATOR +# expected: value: "true" +``` + +Wait for convergence: + +```bash +echo "=== waiting up to 60s for all 12 CRDs to converge ===" +for i in $(seq 1 60); do + count=$(for c in $(kubectl get crd -o name | grep toolhive.stacklok.dev); do + kubectl get $c -o jsonpath='{.status.storedVersions}' + echo + done | grep -c '^\["v1beta1"\]$') + if [ "$count" = "12" ]; then + echo "All 12 CRDs converged after ${i}s" + break + fi + sleep 1 +done +``` + +In practice this completes in ~1-2 seconds once the new pod is ready. + +Verify: + +```bash +echo "=== storedVersions on ALL 12 CRDs (migrator ON, post-converge) ===" +for crd in $(kubectl get crd -o name | grep toolhive.stacklok.dev | sort); do + name=$(echo $crd | sed 's|.*/||') + stored=$(kubectl get $crd -o jsonpath='{.status.storedVersions}') + printf " %-55s %s\n" "$name" "$stored" +done +# expected: every line ends with ["v1beta1"] + +echo "=== StorageVersionMigrationSucceeded events ===" +kubectl get events -A --field-selector reason=StorageVersionMigrationSucceeded --no-headers | wc -l +# expected: 12 — one event per CRD + +echo "=== StorageVersionMigrationFailed events (should be 0) ===" +kubectl get events -A --field-selector reason=StorageVersionMigrationFailed --no-headers | wc -l +# expected: 0 +``` + +Confirm CRs still healthy (admission webhooks on MCPServer / MCPGroup / VirtualMCPServer all accepted the per-CR Updates): + +```bash +kubectl get \ + mcpservers,mcpremoteproxies,mcptoolconfigs,mcpgroups,embeddingservers,mcpregistries,virtualmcpservers,virtualmcpcompositetooldefinitions,mcpoidcconfigs,mcptelemetryconfigs,mcpexternalauthconfigs,mcpserverentries \ + -n upgrade-test --no-headers | wc -l +# expected: 12 +``` + +## 10 · (Optional) Inspect migrator logs + +```bash +NEW_POD=$(kubectl get pods -n toolhive-system --no-headers | grep "toolhive-operator-" | awk '{print $1}' | head -1) +kubectl logs "$NEW_POD" -n toolhive-system | grep "storage version migration complete" | wc -l +# expected: 12 lines — one per CRD +``` + +**If this prints 0**, the migration may have happened in a previous container instance (the operator pod can restart for unrelated reasons in a kind cluster — leases, OOM, etc.). Try the previous container's logs: + +```bash +kubectl logs "$NEW_POD" -n toolhive-system --previous | grep "storage version migration complete" | wc -l +``` + +The migration is complete in either case — the events on the CRDs in step 9 are the authoritative signal. + +## 11 · (Optional) Simulate the next release that drops v1alpha1 + +Once `storedVersions` is `[v1beta1]` everywhere, the apiserver will accept removal of `v1alpha1` from `spec.versions` — the safety interlock the migrator exists to satisfy. To demonstrate: + +```bash +# Direct apiserver patch — the same end state a future "drop v1alpha1" chart would produce. +for crd in $(kubectl get crd -o name | grep toolhive.stacklok.dev); do + name=$(echo $crd | sed 's|.*/||') + newversions=$(kubectl get $crd -o json | jq '.spec.versions | map(select(.name != "v1alpha1"))') + patch=$(jq -n --argjson v "$newversions" '{spec:{versions:$v}}') + printf " %-55s " "$name" + kubectl patch $crd --type=merge -p "$patch" 2>&1 | tail -1 +done +``` + +Every line should say `... patched`. Verify v1alpha1 access now fails: + +```bash +kubectl get mcpgroups.v1alpha1.toolhive.stacklok.dev test-group -n upgrade-test +# expected: error — the server doesn't have a resource type "mcpgroups" + +kubectl get mcpgroups.v1beta1.toolhive.stacklok.dev test-group -n upgrade-test +# expected: the resource +``` + +**Negative test**: if you skip step 9 (the migrator pass) and jump straight to step 11, every `kubectl patch` will fail with: + +``` +Invalid value: "v1alpha1": must appear in spec.versions +``` + +That's the apiserver wall the migrator exists to clear. + +## 12 · Cleanup + +```bash +kind delete cluster --name toolhive +rm -f kconfig.yaml /tmp/before-upgrade.txt /tmp/after-upgrade.txt \ + /tmp/img-op /tmp/img-pr /tmp/img-vm +``` + +--- + +## Summary of what's verified + +| Check | Validates | Step | +|---|---|---| +| Operator startup logs show migrator skipped when disabled | The `setupStorageVersionMigrator` branch in `app/app.go` honours `ENABLE_STORAGE_VERSION_MIGRATOR=false` | 5 | +| Zero-downtime upgrade across operator chart upgrade | The PR's operator changes don't recreate any managed workload | 6 | +| `storedVersions` is `[v1alpha1, v1beta1]` after re-apply with migrator OFF | Migrator did not run; baseline state is correctly preserved | 8 | +| Helm-upgrade flip enables the migrator | Feature flag round-trips correctly | 9 | +| `storedVersions` converges to `[v1beta1]` on all 12 CRDs | The actual migration logic works against real ToolHive CRDs | 9 | +| 12 `StorageVersionMigrationSucceeded` events on the CRDs | The Recorder is correctly wired and per-CRD migrations are observable | 9 | +| 0 `StorageVersionMigrationFailed` events | No CRD's per-CR Update loop returned a non-retriable error | 9 | +| All 12 CRs still healthy after migration | Validating admission webhooks (MCPServer / MCPGroup / VirtualMCPServer) tolerate the per-CR Updates | 9 | +| RBAC permits storedVersions trim | The ClusterRole has the correct verbs for `customresourcedefinitions/status` and `*.toolhive.stacklok.dev/*` | implicit — trim succeeds in step 9 | +| Apiserver permits `v1alpha1` removal from `spec.versions` once `storedVersions` is clean | The deprecation chain end-to-end works | 11 | + +## What this does NOT cover + +- **Mid-migration crash recovery**: no test forces the operator to crash mid-loop. Envtest covers the conflict-handling and re-encode-failure paths separately. +- **Pagination at real-cluster scale**: 12 CRs is well below the 500-default page size. The envtest suite explicitly tests the continue-token loop with 7 CRs at `PageSize=3`. +- **Operator restart resilience under load**: kind clusters are resource-limited and the operator pod sometimes restarts during tests for unrelated reasons (the `--previous` log fallback in step 10 covers this). + +These are covered by the envtest suite at `cmd/thv-operator/test-integration/storageversionmigrator/`. This walkthrough covers what envtest can't: helm chart wiring, real ToolHive CRD schemas with their actual webhooks, and the full upgrade sequence a real user would run. diff --git a/docs/operator/upgrade-guide/crs-v1alpha1.yaml b/docs/operator/upgrade-guide/crs-v1alpha1.yaml new file mode 100644 index 0000000000..ec1a2d0aa5 --- /dev/null +++ b/docs/operator/upgrade-guide/crs-v1alpha1.yaml @@ -0,0 +1,149 @@ +# All 12 ToolHive CR kinds at v1alpha1. +# Apply with: +# kubectl create namespace upgrade-test +# kubectl apply -f crs-v1alpha1.yaml +# +# Note: MCPServerEntry references MCPGroup (groupRef.name: test-group), +# so MCPGroup must be applied first. kubectl handles ordering within a +# single file by best-effort, and since all refs here are by name (not +# UID), no strict ordering is required — missing refs just stay pending. +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPGroup +metadata: + name: test-group + namespace: upgrade-test +spec: + description: Test group for upgrade validation +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPServer +metadata: + name: test-server + namespace: upgrade-test +spec: + image: ghcr.io/github/github-mcp-server + transport: stdio +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPRemoteProxy +metadata: + name: test-remote-proxy + namespace: upgrade-test +spec: + remoteUrl: https://example.com/mcp + transport: streamable-http +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPToolConfig +metadata: + name: test-tool-config + namespace: upgrade-test +spec: + toolsFilter: + - allowed_tool +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: EmbeddingServer +metadata: + name: test-embedding + namespace: upgrade-test +spec: + image: ghcr.io/stacklok/test-embedding:latest + model: test-model +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPRegistry +metadata: + name: test-registry + namespace: upgrade-test +spec: + displayName: Test Registry + configYAML: | + sources: + - name: k8s + format: upstream + kubernetes: {} + registries: + - name: default + sources: ["k8s"] + database: + host: postgres + port: 5432 + user: db_app + database: registry + auth: + mode: anonymous +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPOIDCConfig +metadata: + name: test-oidc + namespace: upgrade-test +spec: + type: inline + inline: + issuer: https://auth.example.com + jwksUrl: https://auth.example.com/.well-known/jwks.json +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPTelemetryConfig +metadata: + name: test-telemetry + namespace: upgrade-test +spec: + openTelemetry: + endpoint: http://otel-collector:4317 + metrics: + enabled: true + tracing: + enabled: true +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPExternalAuthConfig +metadata: + name: test-external-auth + namespace: upgrade-test +spec: + type: tokenExchange + tokenExchange: + tokenUrl: https://auth.example.com/token + audience: https://api.example.com + clientId: test-client + clientSecretRef: + name: test-secret + key: client-secret +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: VirtualMCPServer +metadata: + name: test-virtual-server + namespace: upgrade-test +spec: + groupRef: + name: test-group + incomingAuth: + type: anonymous +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: VirtualMCPCompositeToolDefinition +metadata: + name: test-composite-tool + namespace: upgrade-test +spec: + name: test_workflow + description: Test composite workflow + steps: + - id: step-1 + tool: placeholder.tool +--- +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: MCPServerEntry +metadata: + name: test-server-entry + namespace: upgrade-test +spec: + groupRef: + name: test-group + transport: streamable-http + remoteUrl: https://example.com/mcp diff --git a/docs/operator/upgrade-guide/crs-v1beta1.yaml b/docs/operator/upgrade-guide/crs-v1beta1.yaml new file mode 100644 index 0000000000..b8570aa141 --- /dev/null +++ b/docs/operator/upgrade-guide/crs-v1beta1.yaml @@ -0,0 +1,149 @@ +# All 12 ToolHive CR kinds at v1beta1. +# Apply with: +# kubectl create namespace upgrade-test +# kubectl apply -f crs-v1beta1.yaml +# +# Note: MCPServerEntry references MCPGroup (groupRef.name: test-group), +# so MCPGroup must be applied first. kubectl handles ordering within a +# single file by best-effort, and since all refs here are by name (not +# UID), no strict ordering is required — missing refs just stay pending. +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPGroup +metadata: + name: test-group + namespace: upgrade-test +spec: + description: Test group for upgrade validation +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPServer +metadata: + name: test-server + namespace: upgrade-test +spec: + image: ghcr.io/github/github-mcp-server + transport: stdio +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPRemoteProxy +metadata: + name: test-remote-proxy + namespace: upgrade-test +spec: + remoteUrl: https://example.com/mcp + transport: streamable-http +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPToolConfig +metadata: + name: test-tool-config + namespace: upgrade-test +spec: + toolsFilter: + - allowed_tool +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: EmbeddingServer +metadata: + name: test-embedding + namespace: upgrade-test +spec: + image: ghcr.io/stacklok/test-embedding:latest + model: test-model +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPRegistry +metadata: + name: test-registry + namespace: upgrade-test +spec: + displayName: Test Registry + configYAML: | + sources: + - name: k8s + format: upstream + kubernetes: {} + registries: + - name: default + sources: ["k8s"] + database: + host: postgres + port: 5432 + user: db_app + database: registry + auth: + mode: anonymous +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPOIDCConfig +metadata: + name: test-oidc + namespace: upgrade-test +spec: + type: inline + inline: + issuer: https://auth.example.com + jwksUrl: https://auth.example.com/.well-known/jwks.json +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPTelemetryConfig +metadata: + name: test-telemetry + namespace: upgrade-test +spec: + openTelemetry: + endpoint: http://otel-collector:4317 + metrics: + enabled: true + tracing: + enabled: true +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPExternalAuthConfig +metadata: + name: test-external-auth + namespace: upgrade-test +spec: + type: tokenExchange + tokenExchange: + tokenUrl: https://auth.example.com/token + audience: https://api.example.com + clientId: test-client + clientSecretRef: + name: test-secret + key: client-secret +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: VirtualMCPServer +metadata: + name: test-virtual-server + namespace: upgrade-test +spec: + groupRef: + name: test-group + incomingAuth: + type: anonymous +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: VirtualMCPCompositeToolDefinition +metadata: + name: test-composite-tool + namespace: upgrade-test +spec: + name: test_workflow + description: Test composite workflow + steps: + - id: step-1 + tool: placeholder.tool +--- +apiVersion: toolhive.stacklok.dev/v1beta1 +kind: MCPServerEntry +metadata: + name: test-server-entry + namespace: upgrade-test +spec: + groupRef: + name: test-group + transport: streamable-http + remoteUrl: https://example.com/mcp