From 130db877fffca62bed9ec5ecff3cf2914af158a0 Mon Sep 17 00:00:00 2001 From: Camila Macedo <7708031+camilamacedo86@users.noreply.github.com> Date: Sun, 11 Jan 2026 05:17:30 +0000 Subject: [PATCH 1/2] test: add e2e tests for workload resilience when catalog is deleted Assisted-by: Cursor --- test/e2e/features/recover.feature | 224 ++++++++++++++++++++++++++++++ test/e2e/steps/steps.go | 12 ++ 2 files changed, 236 insertions(+) diff --git a/test/e2e/features/recover.feature b/test/e2e/features/recover.feature index 0438f2d1a1..b03252eca5 100644 --- a/test/e2e/features/recover.feature +++ b/test/e2e/features/recover.feature @@ -115,3 +115,227 @@ Feature: Recover cluster extension from errors that might occur during its lifet Then ClusterExtension is available And ClusterExtension reports Progressing as True with Reason Succeeded And ClusterExtension reports Installed as True + + # CATALOG DELETION RESILIENCE SCENARIOS + + Scenario: Extension continues running after catalog deletion + Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE} + And ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And resource "deployment/test-operator" is available + And resource "configmap/test-configmap" is available + And ClusterCatalog "test" is deleted + # Catalog deletion triggers automatic reconciliation via ClusterCatalog watch + # Wait for reconciliation to complete by checking the status condition + When ClusterExtension reports Installed as True + Then resource "deployment/test-operator" is available + And resource "configmap/test-configmap" is available + + Scenario: Resources are restored after catalog deletion + Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE} + And ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And resource "configmap/test-configmap" exists + And ClusterCatalog "test" is deleted + When resource "configmap/test-configmap" is removed + Then resource "configmap/test-configmap" is eventually restored + + Scenario: Config changes are allowed even when the catalog does not exist anymore + Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE} + And ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And ClusterCatalog "test" is deleted + When ClusterExtension is updated to add preflight config + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + install: + preflight: + crdUpgradeSafety: + enforcement: None + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + # Wait for reconciliation of the updated spec (config change should succeed without catalog) + And ClusterExtension is available + Then ClusterExtension reports Installed as True + + Scenario: Version upgrade does not proceed when catalog does not exist + Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE} + And ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + version: "1.0.0" + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And bundle "test-operator.1.0.0" is installed in version "1.0.0" + And ClusterCatalog "test" is deleted + When ClusterExtension is updated to change version + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + version: "1.0.1" + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + # Wait for reconciliation after the version change request + And ClusterExtension reports Progressing as True with Reason Retrying + # Verify upgrade did not proceed: version remains at 1.0.0 (not 1.0.1) + Then bundle "test-operator.1.0.0" is installed in version "1.0.0" + And ClusterExtension reports Installed as True + # Note: Retrying status means controller will auto-upgrade when catalog becomes available + + @BoxcutterRuntime + Scenario: Extension with revisions continues running after catalog deletion + Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE} + And ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + And resource "deployment/test-operator" is available + And ClusterCatalog "test" is deleted + # Wait for reconciliation to complete after catalog deletion + When ClusterExtension reports Installed as True + Then resource "deployment/test-operator" is available + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + + @BoxcutterRuntime + Scenario: Revision remains available after catalog deletion + Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE} + And ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + version: "1.2.0" + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + And ClusterCatalog "test" is deleted + # Catalog deletion should NOT affect workload availability + # Wait for reconciliation to complete after catalog deletion + When ClusterExtension reports Installed as True + Then ClusterExtension is available + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + And resource "deployment/test-operator" is available diff --git a/test/e2e/steps/steps.go b/test/e2e/steps/steps.go index e1a55934dd..ea988bd0de 100644 --- a/test/e2e/steps/steps.go +++ b/test/e2e/steps/steps.go @@ -85,6 +85,7 @@ func RegisterSteps(sc *godog.ScenarioContext) { sc.Step(`^(?i)ClusterCatalog "([^"]+)" serves bundles$`, CatalogServesBundles) sc.Step(`^"([^"]+)" catalog image version "([^"]+)" is also tagged as "([^"]+)"$`, TagCatalogImage) sc.Step(`^(?i)ClusterCatalog "([^"]+)" image version "([^"]+)" is also tagged as "([^"]+)"$`, TagCatalogImage) + sc.Step(`^(?i)ClusterCatalog "([^"]+)" is deleted$`, CatalogIsDeleted) sc.Step(`^(?i)operator "([^"]+)" target namespace is "([^"]+)"$`, OperatorTargetNamespace) sc.Step(`^(?i)Prometheus metrics are returned in the response$`, PrometheusMetricsAreReturned) @@ -664,6 +665,17 @@ func TagCatalogImage(name, oldTag, newTag string) error { return crane.Tag(imageRef, newTag, crane.Insecure) } +func CatalogIsDeleted(ctx context.Context, catalogName string) error { + catalogFullName := fmt.Sprintf("%s-catalog", catalogName) + // Using --wait=true makes kubectl wait for the resource to be fully deleted, + // eliminating the need for manual polling + _, err := k8sClient("delete", "clustercatalog", catalogFullName, "--ignore-not-found=true", "--wait=true") + if err != nil { + return fmt.Errorf("failed to delete catalog: %v", err) + } + return nil +} + func PrometheusMetricsAreReturned(ctx context.Context) error { sc := scenarioCtx(ctx) for podName, mr := range sc.metricsResponse { From 7c2362431ea9ea7d835bdedca1e64d69e4ae7f28 Mon Sep 17 00:00:00 2001 From: Camila Macedo <7708031+camilamacedo86@users.noreply.github.com> Date: Sun, 11 Jan 2026 05:39:18 +0000 Subject: [PATCH 2/2] (fix) catalog deletion resilience support Enables installed extensions to continue working when their source catalog becomes unavailable or is deleted. When resolution fails due to catalog unavailability, the operator now continues reconciling with the currently installed bundle instead of failing. Changes: - Resolution falls back to installed bundle when catalog unavailable - Unpacking skipped when maintaining current installed state - Helm and Boxcutter appliers handle nil contentFS gracefully - Version upgrades properly blocked without catalog access This ensures workloads remain stable and operational even when the catalog they were installed from is temporarily unavailable or deleted, while appropriately preventing version changes that require catalog access. Assisted-by: Cursor --- cmd/operator-controller/main.go | 4 +- .../operator-controller/applier/boxcutter.go | 46 +- .../applier/boxcutter_test.go | 6 +- internal/operator-controller/applier/helm.go | 66 +++ .../controllers/boxcutter_reconcile_steps.go | 5 +- .../boxcutter_reconcile_steps_apply_test.go | 4 +- .../clusterextension_admission_test.go | 4 +- .../clusterextension_controller.go | 2 + .../clusterextension_controller_test.go | 440 ++++++++++++++++++ .../clusterextension_reconcile_steps.go | 196 +++++++- .../controllers/suite_test.go | 2 +- test/utils.go | 1 + 12 files changed, 742 insertions(+), 34 deletions(-) diff --git a/cmd/operator-controller/main.go b/cmd/operator-controller/main.go index 2d78edda4f..eef9dfc03d 100644 --- a/cmd/operator-controller/main.go +++ b/cmd/operator-controller/main.go @@ -623,7 +623,7 @@ func (c *boxcutterReconcilerConfigurator) Configure(ceReconciler *controllers.Cl controllers.HandleFinalizers(c.finalizers), controllers.MigrateStorage(storageMigrator), controllers.RetrieveRevisionStates(revisionStatesGetter), - controllers.ResolveBundle(c.resolver), + controllers.ResolveBundle(c.resolver, c.mgr.GetClient()), controllers.UnpackBundle(c.imagePuller, c.imageCache), controllers.ApplyBundleWithBoxcutter(appl.Apply), } @@ -742,7 +742,7 @@ func (c *helmReconcilerConfigurator) Configure(ceReconciler *controllers.Cluster ceReconciler.ReconcileSteps = []controllers.ReconcileStepFunc{ controllers.HandleFinalizers(c.finalizers), controllers.RetrieveRevisionStates(revisionStatesGetter), - controllers.ResolveBundle(c.resolver), + controllers.ResolveBundle(c.resolver, c.mgr.GetClient()), controllers.UnpackBundle(c.imagePuller, c.imageCache), controllers.ApplyBundle(appl), } diff --git a/internal/operator-controller/applier/boxcutter.go b/internal/operator-controller/applier/boxcutter.go index 63578e9cb8..8a0264c843 100644 --- a/internal/operator-controller/applier/boxcutter.go +++ b/internal/operator-controller/applier/boxcutter.go @@ -303,21 +303,37 @@ func (bc *Boxcutter) createOrUpdate(ctx context.Context, obj client.Object) erro return bc.Client.Patch(ctx, obj, client.Apply, client.FieldOwner(bc.FieldOwner), client.ForceOwnership) } -func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) error { - // Generate desired revision - desiredRevision, err := bc.RevisionGenerator.GenerateRevision(ctx, contentFS, ext, objectLabels, revisionAnnotations) +func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) (bool, string, error) { + // Note: We list revisions first (before checking contentFS) because we need the revision list + // to determine if we can fall back when contentFS is nil. If the API call fails here, + // it indicates a serious cluster connectivity issue, and we should not proceed even in fallback mode + // since the ClusterExtensionRevision controller also requires API access to maintain resources. + existingRevisions, err := bc.getExistingRevisions(ctx, ext.GetName()) if err != nil { - return err + return false, "", err } - if err := controllerutil.SetControllerReference(ext, desiredRevision, bc.Scheme); err != nil { - return fmt.Errorf("set ownerref: %w", err) + // If contentFS is nil, we're maintaining the current state without catalog access. + // In this case, we should use the existing installed revision without generating a new one. + if contentFS == nil { + if len(existingRevisions) == 0 { + return false, "", fmt.Errorf("catalog content unavailable and no revision installed") + } + // Returning true here signals that the rollout has succeeded using the current revision. The + // ClusterExtensionRevision controller will continue to reconcile, apply, and maintain the + // resources defined in that revision via Server-Side Apply, ensuring the workload keeps running + // even when catalog access (and thus new revision content) is unavailable. + return true, "", nil } - // List all existing revisions - existingRevisions, err := bc.getExistingRevisions(ctx, ext.GetName()) + // Generate desired revision + desiredRevision, err := bc.RevisionGenerator.GenerateRevision(ctx, contentFS, ext, objectLabels, revisionAnnotations) if err != nil { - return err + return false, "", err + } + + if err := controllerutil.SetControllerReference(ext, desiredRevision, bc.Scheme); err != nil { + return false, "", fmt.Errorf("set ownerref: %w", err) } currentRevision := &ocv1.ClusterExtensionRevision{} @@ -339,7 +355,7 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust // inplace patch was successful, no changes in phases state = StateUnchanged default: - return fmt.Errorf("patching %s Revision: %w", desiredRevision.Name, err) + return false, "", fmt.Errorf("patching %s Revision: %w", desiredRevision.Name, err) } } @@ -353,7 +369,7 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust case StateNeedsInstall: err := preflight.Install(ctx, plainObjs) if err != nil { - return err + return false, "", err } // TODO: jlanford's IDE says that "StateNeedsUpgrade" condition is always true, but // it isn't immediately obvious why that is. Perhaps len(existingRevisions) is @@ -362,7 +378,7 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust case StateNeedsUpgrade: err := preflight.Upgrade(ctx, plainObjs) if err != nil { - return err + return false, "", err } } } @@ -376,15 +392,15 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust desiredRevision.Spec.Revision = revisionNumber if err = bc.garbageCollectOldRevisions(ctx, prevRevisions); err != nil { - return fmt.Errorf("garbage collecting old revisions: %w", err) + return false, "", fmt.Errorf("garbage collecting old revisions: %w", err) } if err := bc.createOrUpdate(ctx, desiredRevision); err != nil { - return fmt.Errorf("creating new Revision: %w", err) + return false, "", fmt.Errorf("creating new Revision: %w", err) } } - return nil + return true, "", nil } // garbageCollectOldRevisions deletes archived revisions beyond ClusterExtensionRevisionRetentionLimit. diff --git a/internal/operator-controller/applier/boxcutter_test.go b/internal/operator-controller/applier/boxcutter_test.go index 081747285c..b61ae16a5a 100644 --- a/internal/operator-controller/applier/boxcutter_test.go +++ b/internal/operator-controller/applier/boxcutter_test.go @@ -926,14 +926,18 @@ func TestBoxcutter_Apply(t *testing.T) { labels.PackageNameKey: "test-package", } } - err := boxcutter.Apply(t.Context(), testFS, ext, nil, revisionAnnotations) + completed, status, err := boxcutter.Apply(t.Context(), testFS, ext, nil, revisionAnnotations) // Assert if tc.expectedErr != "" { require.Error(t, err) assert.Contains(t, err.Error(), tc.expectedErr) + assert.False(t, completed) + assert.Empty(t, status) } else { require.NoError(t, err) + assert.True(t, completed) + assert.Empty(t, status) } if tc.validate != nil { diff --git a/internal/operator-controller/applier/helm.go b/internal/operator-controller/applier/helm.go index 22ed096fe1..7450872bdd 100644 --- a/internal/operator-controller/applier/helm.go +++ b/internal/operator-controller/applier/helm.go @@ -103,6 +103,16 @@ func (h *Helm) runPreAuthorizationChecks(ctx context.Context, ext *ocv1.ClusterE } func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels map[string]string, storageLabels map[string]string) (bool, string, error) { + // If contentFS is nil, we're maintaining the current state without catalog access. + // In this case, reconcile the existing Helm release if it exists. + if contentFS == nil { + ac, err := h.ActionClientGetter.ActionClientFor(ctx, ext) + if err != nil { + return false, "", err + } + return h.reconcileExistingRelease(ctx, ac, ext) + } + chrt, err := h.buildHelmChart(contentFS, ext) if err != nil { return false, "", err @@ -197,6 +207,62 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte return true, "", nil } +// reconcileExistingRelease reconciles an existing Helm release without catalog access. +// This is used when the catalog is unavailable but we need to maintain the current installation. +// It reconciles the release to actively maintain resources, and sets up watchers for monitoring/observability. +func (h *Helm) reconcileExistingRelease(ctx context.Context, ac helmclient.ActionInterface, ext *ocv1.ClusterExtension) (bool, string, error) { + rel, err := ac.Get(ext.GetName()) + if errors.Is(err, driver.ErrReleaseNotFound) { + return false, "", fmt.Errorf("catalog content unavailable and no release installed") + } + if err != nil { + return false, "", fmt.Errorf("failed to get current release: %w", err) + } + + // Reconcile the existing release to ensure resources are maintained + if err := ac.Reconcile(rel); err != nil { + // Reconcile failed - resources NOT maintained + // Return false (rollout failed) with error + return false, "", err + } + + // At this point: Reconcile succeeded - resources ARE maintained (applied to cluster via Server-Side Apply) + // The operations below are for setting up watches to detect drift (i.e., if someone manually modifies the + // resources). If watch setup fails, the resources are still successfully maintained, but we won't detect + // and auto-correct manual modifications. We return true (rollout succeeded) and log watch errors. + logger := klog.FromContext(ctx) + + relObjects, err := util.ManifestObjects(strings.NewReader(rel.Manifest), fmt.Sprintf("%s-release-manifest", rel.Name)) + if err != nil { + logger.Error(err, "failed to parse manifest objects, cannot set up drift detection watches (resources are applied but drift detection disabled)") + return true, "", nil + } + + logger.V(1).Info("setting up drift detection watches on managed objects") + + // Defensive nil checks to prevent panics if Manager or Watcher not properly initialized + if h.Manager == nil { + logger.Error(fmt.Errorf("manager is nil"), "Manager not initialized, cannot set up drift detection watches (resources are applied but drift detection disabled)") + return true, "", nil + } + cache, err := h.Manager.Get(ctx, ext) + if err != nil { + logger.Error(err, "failed to get managed content cache, cannot set up drift detection watches (resources are applied but drift detection disabled)") + return true, "", nil + } + + if h.Watcher == nil { + logger.Error(fmt.Errorf("watcher is nil"), "Watcher not initialized, cannot set up drift detection watches (resources are applied but drift detection disabled)") + return true, "", nil + } + if err := cache.Watch(ctx, h.Watcher, relObjects...); err != nil { + logger.Error(err, "failed to set up drift detection watches (resources are applied but drift detection disabled)") + return true, "", nil + } + + return true, "", nil +} + func (h *Helm) buildHelmChart(bundleFS fs.FS, ext *ocv1.ClusterExtension) (*chart.Chart, error) { if h.HelmChartProvider == nil { return nil, errors.New("HelmChartProvider is nil") diff --git a/internal/operator-controller/controllers/boxcutter_reconcile_steps.go b/internal/operator-controller/controllers/boxcutter_reconcile_steps.go index 4c9eec36ce..5c746c7b0f 100644 --- a/internal/operator-controller/controllers/boxcutter_reconcile_steps.go +++ b/internal/operator-controller/controllers/boxcutter_reconcile_steps.go @@ -94,7 +94,7 @@ func MigrateStorage(m StorageMigrator) ReconcileStepFunc { } } -func ApplyBundleWithBoxcutter(apply func(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) error) ReconcileStepFunc { +func ApplyBundleWithBoxcutter(apply func(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) (bool, string, error)) ReconcileStepFunc { return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) { l := log.FromContext(ctx) revisionAnnotations := map[string]string{ @@ -109,7 +109,8 @@ func ApplyBundleWithBoxcutter(apply func(ctx context.Context, contentFS fs.FS, e } l.Info("applying bundle contents") - if err := apply(ctx, state.imageFS, ext, objLbls, revisionAnnotations); err != nil { + _, _, err := apply(ctx, state.imageFS, ext, objLbls, revisionAnnotations) + if err != nil { // If there was an error applying the resolved bundle, // report the error via the Progressing condition. setStatusProgressing(ext, wrapErrorWithResolutionInfo(state.resolvedRevisionMetadata.BundleMetadata, err)) diff --git a/internal/operator-controller/controllers/boxcutter_reconcile_steps_apply_test.go b/internal/operator-controller/controllers/boxcutter_reconcile_steps_apply_test.go index 63aff2b0ad..78adb70090 100644 --- a/internal/operator-controller/controllers/boxcutter_reconcile_steps_apply_test.go +++ b/internal/operator-controller/controllers/boxcutter_reconcile_steps_apply_test.go @@ -133,8 +133,8 @@ func TestApplyBundleWithBoxcutter(t *testing.T) { imageFS: fstest.MapFS{}, } - stepFunc := ApplyBundleWithBoxcutter(func(_ context.Context, _ fs.FS, _ *ocv1.ClusterExtension, _, _ map[string]string) error { - return nil + stepFunc := ApplyBundleWithBoxcutter(func(_ context.Context, _ fs.FS, _ *ocv1.ClusterExtension, _, _ map[string]string) (bool, string, error) { + return true, "", nil }) result, err := stepFunc(ctx, state, ext) require.NoError(t, err) diff --git a/internal/operator-controller/controllers/clusterextension_admission_test.go b/internal/operator-controller/controllers/clusterextension_admission_test.go index 2f8791999f..6ce9fc3c76 100644 --- a/internal/operator-controller/controllers/clusterextension_admission_test.go +++ b/internal/operator-controller/controllers/clusterextension_admission_test.go @@ -13,7 +13,9 @@ import ( ) func TestClusterExtensionSourceConfig(t *testing.T) { - sourceTypeEmptyError := "Invalid value: null" + // NOTE: Kubernetes validation error format for JSON null values varies across K8s versions. + // We check for the common part "Invalid value:" which appears in all versions. + sourceTypeEmptyError := "Invalid value:" sourceTypeMismatchError := "spec.source.sourceType: Unsupported value" sourceConfigInvalidError := "spec.source: Invalid value" // unionField represents the required Catalog or (future) Bundle field required by SourceConfig diff --git a/internal/operator-controller/controllers/clusterextension_controller.go b/internal/operator-controller/controllers/clusterextension_controller.go index 7f3192b0fd..f381152e6c 100644 --- a/internal/operator-controller/controllers/clusterextension_controller.go +++ b/internal/operator-controller/controllers/clusterextension_controller.go @@ -168,6 +168,8 @@ func (r *ClusterExtensionReconciler) Reconcile(ctx context.Context, req ctrl.Req // ensureAllConditionsWithReason checks that all defined condition types exist in the given ClusterExtension, // and assigns a specified reason and custom message to any missing condition. +// +//nolint:unparam // reason parameter is designed to be flexible, even if current callers use the same value func ensureAllConditionsWithReason(ext *ocv1.ClusterExtension, reason v1alpha1.ConditionReason, message string) { for _, condType := range conditionsets.ConditionTypes { cond := apimeta.FindStatusCondition(ext.Status.Conditions, condType) diff --git a/internal/operator-controller/controllers/clusterextension_controller_test.go b/internal/operator-controller/controllers/clusterextension_controller_test.go index 1b09a43adf..eabdba4f94 100644 --- a/internal/operator-controller/controllers/clusterextension_controller_test.go +++ b/internal/operator-controller/controllers/clusterextension_controller_test.go @@ -1611,3 +1611,443 @@ func TestGetInstalledBundleHistory(t *testing.T) { } } } + +// TestResolutionFallbackToInstalledBundle tests the catalog deletion resilience fallback logic +func TestResolutionFallbackToInstalledBundle(t *testing.T) { + t.Run("falls back when catalog unavailable and no version change", func(t *testing.T) { + resolveAttempt := 0 + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + // First reconcile: catalog available, second reconcile: catalog unavailable + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + resolveAttempt++ + if resolveAttempt == 1 { + // First reconcile: catalog available, resolve to version 1.0.0 + v := bundle.VersionRelease{Version: bsemver.MustParse("1.0.0")} + return &declcfg.Bundle{ + Name: "test.1.0.0", + Package: "test-pkg", + Image: "test-image:1.0.0", + }, &v, &declcfg.Deprecation{}, nil + } + // Second reconcile: catalog unavailable + return nil, nil, nil, fmt.Errorf("catalog unavailable") + }) + // Applier succeeds (resources maintained) + d.Applier = &MockApplier{ + installCompleted: true, + installStatus: "", + err: nil, + } + d.ImagePuller = &imageutil.MockPuller{ImageFS: fstest.MapFS{}} + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + Package: "test-pkg", + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + Image: "test-image:1.0.0", + }, + }, + } + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + // Create ClusterExtension with no version specified + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + // No version - should fall back + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // First reconcile: catalog available, install version 1.0.0 + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + require.NoError(t, cl.Get(ctx, extKey, ext)) + require.Equal(t, "1.0.0", ext.Status.Install.Bundle.Version) + + // Verify status after first install + instCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, instCond.Reason) + + progCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, progCond.Reason) + + // Verify all conditions are present and valid after first reconcile + verifyInvariants(ctx, t, cl, ext) + + // Second reconcile: catalog unavailable, should fallback to installed version + // Catalog watch will trigger reconciliation when catalog becomes available again + res, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Verify status shows successful reconciliation after fallback + require.NoError(t, cl.Get(ctx, extKey, ext)) + + // Version should remain 1.0.0 (maintained from fallback) + require.Equal(t, "1.0.0", ext.Status.Install.Bundle.Version) + + // Progressing should be Succeeded (apply completed successfully) + progCond = apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, progCond.Reason) + + // Installed should be True (maintaining current version) + instCond = apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, instCond.Reason) + + // Verify all conditions remain valid after fallback + verifyInvariants(ctx, t, cl, ext) + }) + + t.Run("fails when version upgrade requested without catalog", func(t *testing.T) { + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + return nil, nil, nil, fmt.Errorf("catalog unavailable") + }) + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + }, + }, + } + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + // Create ClusterExtension requesting version upgrade + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Version: "1.0.1", // Requesting upgrade + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // Reconcile should fail (can't upgrade without catalog) + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.Error(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Verify status shows Retrying + require.NoError(t, cl.Get(ctx, extKey, ext)) + + // Progressing should be Retrying (can't resolve without catalog) + progCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonRetrying, progCond.Reason) + + // Installed should be True (v1.0.0 is already installed per RevisionStatesGetter) + // but we can't upgrade to v1.0.1 without catalog + instCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + + // Verify all conditions are present and valid + verifyInvariants(ctx, t, cl, ext) + }) + + t.Run("auto-updates when catalog becomes available after fallback", func(t *testing.T) { + resolveAttempt := 0 + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + // First attempt: catalog unavailable, then becomes available + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + resolveAttempt++ + if resolveAttempt == 1 { + // First reconcile: catalog unavailable + return nil, nil, nil, fmt.Errorf("catalog temporarily unavailable") + } + // Second reconcile (triggered by catalog watch): catalog available with new version + v := bundle.VersionRelease{Version: bsemver.MustParse("2.0.0")} + return &declcfg.Bundle{ + Name: "test.2.0.0", + Package: "test-pkg", + Image: "test-image:2.0.0", + }, &v, &declcfg.Deprecation{}, nil + }) + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + Package: "test-pkg", + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + Image: "test-image:1.0.0", + }, + }, + } + d.ImagePuller = &imageutil.MockPuller{ImageFS: fstest.MapFS{}} + d.Applier = &MockApplier{installCompleted: true} + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + // No version - auto-update to latest + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // First reconcile: catalog unavailable, falls back to v1.0.0 + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + require.NoError(t, cl.Get(ctx, extKey, ext)) + require.Equal(t, "1.0.0", ext.Status.Install.Bundle.Version) + + // Verify core status after fallback to installed version + instCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, instCond.Reason) + + progCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, progCond.Reason) + + // Note: When falling back without catalog access initially, deprecation conditions + // may not be set yet. Full validation happens after catalog is available. + + // Second reconcile: simulating catalog watch trigger, catalog now available with v2.0.0 + res, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Should have upgraded to v2.0.0 + require.NoError(t, cl.Get(ctx, extKey, ext)) + require.Equal(t, "2.0.0", ext.Status.Install.Bundle.Version) + + // Verify status after upgrade + instCond = apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, instCond.Reason) + + progCond = apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, progCond.Reason) + + // Verify all conditions remain valid after upgrade + verifyInvariants(ctx, t, cl, ext) + + // Verify resolution was attempted twice (fallback, then success) + require.Equal(t, 2, resolveAttempt) + }) + + t.Run("retries when catalogs exist but resolution fails", func(t *testing.T) { + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + // Resolver fails (transient issue) + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + return nil, nil, nil, fmt.Errorf("transient catalog issue: cache stale") + }) + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + Package: "test-pkg", + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + Image: "test-image:1.0.0", + }, + }, + } + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + // Create a ClusterCatalog matching the extension's selector + catalog := &ocv1.ClusterCatalog{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-catalog", + }, + Spec: ocv1.ClusterCatalogSpec{ + Source: ocv1.CatalogSource{ + Type: ocv1.SourceTypeImage, + Image: &ocv1.ImageSource{ + Ref: "test-registry/catalog:latest", + }, + }, + }, + } + require.NoError(t, cl.Create(ctx, catalog)) + + // Create ClusterExtension with no version specified + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + // No version specified + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // Reconcile should fail and RETRY (not fallback) + // Catalogs exist, so this is likely a transient issue (catalog updating, cache stale, etc.) + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.Error(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Verify status shows Retrying (not falling back to installed bundle) + require.NoError(t, cl.Get(ctx, extKey, ext)) + + progCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonRetrying, progCond.Reason) + require.Contains(t, progCond.Message, "transient catalog issue") + + // Installed should remain True (existing installation is maintained) + instCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + + // Verify we did NOT fall back - status should show we're retrying + verifyInvariants(ctx, t, cl, ext) + + // Clean up the catalog so it doesn't affect other tests + require.NoError(t, cl.Delete(ctx, catalog)) + }) +} + +func TestCheckCatalogsExist(t *testing.T) { + t.Run("returns false when no catalogs exist", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.NoError(t, err) + require.False(t, exists, "should return false when no catalogs exist") + }) + + t.Run("returns false when no selector provided", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Selector: nil, // No selector + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.NoError(t, err) + require.False(t, exists, "should return false when no catalogs exist (no selector)") + }) + + t.Run("returns false when empty selector provided", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Selector: &metav1.LabelSelector{}, // Empty selector (matches everything) + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.NoError(t, err, "empty selector should not cause error") + require.False(t, exists, "should return false when no catalogs exist (empty selector)") + }) + + t.Run("returns error for invalid selector", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Selector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "invalid", + Operator: "InvalidOperator", // Invalid operator + Values: []string{"value"}, + }, + }, + }, + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.Error(t, err, "should return error for invalid selector") + require.Contains(t, err.Error(), "invalid catalog selector") + require.False(t, exists) + }) +} diff --git a/internal/operator-controller/controllers/clusterextension_reconcile_steps.go b/internal/operator-controller/controllers/clusterextension_reconcile_steps.go index 09dd2ce7d2..03cecfe1ce 100644 --- a/internal/operator-controller/controllers/clusterextension_reconcile_steps.go +++ b/internal/operator-controller/controllers/clusterextension_reconcile_steps.go @@ -19,10 +19,12 @@ package controllers import ( "context" "errors" + "fmt" apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/log" @@ -83,7 +85,14 @@ func RetrieveRevisionStates(r RevisionStatesGetter) ReconcileStepFunc { } } -func ResolveBundle(r resolve.Resolver) ReconcileStepFunc { +// ResolveBundle resolves the bundle to install or roll out for a ClusterExtension. +// It requires a controller-runtime client (in addition to the resolve.Resolver) to enable +// intelligent error handling when resolution fails. The client is used to check if ClusterCatalogs +// matching the extension's selector still exist in the cluster, allowing the controller to +// distinguish between "ClusterCatalog deleted" (fall back to installed bundle) and "transient failure" +// (retry resolution). This ensures workload resilience during ClusterCatalog outages while maintaining +// responsiveness during ClusterCatalog updates. +func ResolveBundle(r resolve.Resolver, c client.Client) ReconcileStepFunc { return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) { l := log.FromContext(ctx) var resolvedRevisionMetadata *RevisionMetadata @@ -95,11 +104,7 @@ func ResolveBundle(r resolve.Resolver) ReconcileStepFunc { } resolvedBundle, resolvedBundleVersion, resolvedDeprecation, err := r.Resolve(ctx, ext, bm) if err != nil { - // Note: We don't distinguish between resolution-specific errors and generic errors - setStatusProgressing(ext, err) - setInstalledStatusFromRevisionStates(ext, state.revisionStates) - ensureAllConditionsWithReason(ext, ocv1.ReasonFailed, err.Error()) - return nil, err + return handleResolutionError(ctx, c, state, ext, err) } // set deprecation status after _successful_ resolution @@ -134,19 +139,190 @@ func ResolveBundle(r resolve.Resolver) ReconcileStepFunc { } } +// handleResolutionError handles the case when bundle resolution fails. +// If a bundle is already installed and the spec isn't requesting a version change, +// we check if the ClusterCatalogs have been deleted. If so, we fall back to using the +// installed bundle to maintain the current state (ClusterCatalog deletion resilience). +// However, if ClusterCatalogs still exist, we retry to allow for transient failures or ClusterCatalog updates. +// If the spec explicitly requests a different version, we must fail and retry regardless. +func handleResolutionError(ctx context.Context, c client.Client, state *reconcileState, ext *ocv1.ClusterExtension, err error) (*ctrl.Result, error) { + l := log.FromContext(ctx) + + // If we have an installed bundle, check if we can fall back to it + if state.revisionStates.Installed != nil { + // Check if the spec is requesting a specific version that differs from installed + specVersion := "" + if ext.Spec.Source.Catalog != nil { + specVersion = ext.Spec.Source.Catalog.Version + } + installedVersion := state.revisionStates.Installed.Version + + // If spec requests a different version, we cannot fall back - must fail and retry + if specVersion != "" && specVersion != installedVersion { + msg := fmt.Sprintf("unable to upgrade to version %s: %v (currently installed: %s)", specVersion, err, installedVersion) + l.Error(err, "resolution failed and spec requests version change - cannot fall back", + "requestedVersion", specVersion, + "installedVersion", installedVersion) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, msg) + return nil, err + } + + // No version change requested - check if ClusterCatalogs exist + // Only fall back if ClusterCatalogs have been deleted + catalogsExist, catalogCheckErr := CheckCatalogsExist(ctx, c, ext) + if catalogCheckErr != nil { + msg := fmt.Sprintf("failed to resolve bundle: %v", err) + catalogName := getCatalogNameFromSelector(ext.Spec.Source.Catalog.Selector) + l.Error(catalogCheckErr, "error checking if ClusterCatalogs exist, will retry resolution", + "resolutionError", err, + "packageName", getPackageName(ext), + "catalogName", catalogName) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, msg) + return nil, err + } + + if catalogsExist { + // ClusterCatalogs exist but resolution failed - likely a transient issue (ClusterCatalog updating, cache stale, etc.) + // Retry resolution instead of falling back + msg := fmt.Sprintf("failed to resolve bundle, retrying: %v", err) + catalogName := getCatalogNameFromSelector(ext.Spec.Source.Catalog.Selector) + l.Error(err, "resolution failed but matching ClusterCatalogs exist - retrying instead of falling back", + "packageName", getPackageName(ext), + "catalogName", catalogName) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, msg) + return nil, err + } + + // ClusterCatalogs don't exist (deleted) - fall back to installed bundle to maintain current state. + // The controller watches ClusterCatalog resources, so when ClusterCatalogs become available again, + // a reconcile will be triggered automatically, allowing the extension to upgrade. + msg := fmt.Sprintf("continuing to maintain current installation at version %s due to catalog unavailability", state.revisionStates.Installed.Version) + catalogName := getCatalogNameFromSelector(ext.Spec.Source.Catalog.Selector) + l.Error(err, "matching ClusterCatalogs unavailable or deleted - falling back to installed bundle to maintain workload", + "packageName", getPackageName(ext), + "catalogName", catalogName, + "installedBundle", state.revisionStates.Installed.Name, + "installedVersion", state.revisionStates.Installed.Version) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, msg) + state.resolvedRevisionMetadata = state.revisionStates.Installed + // Return no error to allow Apply step to run and maintain resources. + return nil, nil + } + + // No installed bundle and resolution failed - cannot proceed + msg := fmt.Sprintf("failed to resolve bundle: %v", err) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, msg) + return nil, err +} + +// getCatalogNameFromSelector extracts the catalog name from the selector if available. +// Returns empty string if selector is nil or doesn't contain the metadata.name label. +func getCatalogNameFromSelector(selector *metav1.LabelSelector) string { + if selector == nil || selector.MatchLabels == nil { + return "" + } + return selector.MatchLabels["olm.operatorframework.io/metadata.name"] +} + +// getPackageName safely extracts the package name from the extension spec. +// Returns empty string if Catalog source is nil. +func getPackageName(ext *ocv1.ClusterExtension) string { + if ext.Spec.Source.Catalog == nil { + return "" + } + return ext.Spec.Source.Catalog.PackageName +} + +// CheckCatalogsExist checks if any ClusterCatalogs matching the extension's selector exist. +// Returns true if at least one matching ClusterCatalog exists, false if none exist. +// Treats "CRD doesn't exist" errors as "no ClusterCatalogs exist" (returns false, nil). +// Returns an error only if the check itself fails unexpectedly. +func CheckCatalogsExist(ctx context.Context, c client.Client, ext *ocv1.ClusterExtension) (bool, error) { + var catalogList *ocv1.ClusterCatalogList + var listErr error + + if ext.Spec.Source.Catalog == nil || ext.Spec.Source.Catalog.Selector == nil { + // No selector means all ClusterCatalogs match - check if any ClusterCatalogs exist at all + catalogList = &ocv1.ClusterCatalogList{} + listErr = c.List(ctx, catalogList, client.Limit(1)) + } else { + // Convert label selector to k8slabels.Selector + // Note: An empty LabelSelector matches everything by default + selector, err := metav1.LabelSelectorAsSelector(ext.Spec.Source.Catalog.Selector) + if err != nil { + return false, fmt.Errorf("invalid catalog selector: %w", err) + } + + // List ClusterCatalogs matching the selector (limit to 1 since we only care if any exist) + catalogList = &ocv1.ClusterCatalogList{} + listErr = c.List(ctx, catalogList, client.MatchingLabelsSelector{Selector: selector}, client.Limit(1)) + } + + if listErr != nil { + // Check if the error is because the ClusterCatalog CRD doesn't exist + // This can happen if catalogd is not installed, which means no ClusterCatalogs exist + if apimeta.IsNoMatchError(listErr) { + return false, nil + } + return false, fmt.Errorf("failed to list ClusterCatalogs: %w", listErr) + } + + return len(catalogList.Items) > 0, nil +} + func UnpackBundle(i imageutil.Puller, cache imageutil.Cache) ReconcileStepFunc { return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) { l := log.FromContext(ctx) - l.Info("unpacking resolved bundle") + + // Defensive check: resolvedRevisionMetadata should be set by ResolveBundle step + if state.resolvedRevisionMetadata == nil { + return nil, fmt.Errorf("unable to retrieve bundle information") + } + + // Always try to pull the bundle content (Pull uses cache-first strategy, so this is efficient) + l.V(1).Info("pulling bundle content") imageFS, _, _, err := i.Pull(ctx, ext.GetName(), state.resolvedRevisionMetadata.Image, cache) + + // Check if resolved bundle matches installed bundle (no version change) + bundleUnchanged := state.revisionStates != nil && + state.revisionStates.Installed != nil && + state.resolvedRevisionMetadata.Name == state.revisionStates.Installed.Name && + state.resolvedRevisionMetadata.Version == state.revisionStates.Installed.Version + if err != nil { - // Wrap the error passed to this with the resolution information until we have successfully - // installed since we intend for the progressing condition to replace the resolved condition - // and will be removing the .status.resolution field from the ClusterExtension status API + if bundleUnchanged { + // Bundle hasn't changed and Pull failed (likely cache miss + catalog unavailable). + // This happens in fallback mode after catalog deletion. Set imageFS to nil so the + // applier can maintain the workload using existing Helm release or ClusterExtensionRevision. + l.V(1).Info("bundle unchanged and content unavailable, continuing with nil contentFS (applier will use existing release/revision)", + "bundle", state.resolvedRevisionMetadata.Name, + "version", state.resolvedRevisionMetadata.Version, + "error", err.Error()) + state.imageFS = nil + return nil, nil + } + // New bundle version but Pull failed - this is an error condition setStatusProgressing(ext, wrapErrorWithResolutionInfo(state.resolvedRevisionMetadata.BundleMetadata, err)) setInstalledStatusFromRevisionStates(ext, state.revisionStates) return nil, err } + + if bundleUnchanged { + l.V(1).Info("bundle unchanged, using cached content for resource reconciliation", + "bundle", state.resolvedRevisionMetadata.Name, + "version", state.resolvedRevisionMetadata.Version) + } + state.imageFS = imageFS return nil, nil } diff --git a/internal/operator-controller/controllers/suite_test.go b/internal/operator-controller/controllers/suite_test.go index 6258a5d307..57305a75fd 100644 --- a/internal/operator-controller/controllers/suite_test.go +++ b/internal/operator-controller/controllers/suite_test.go @@ -106,7 +106,7 @@ func newClientAndReconciler(t *testing.T, opts ...reconcilerOption) (client.Clie } reconciler.ReconcileSteps = []controllers.ReconcileStepFunc{controllers.HandleFinalizers(d.Finalizers), controllers.RetrieveRevisionStates(d.RevisionStatesGetter)} if r := d.Resolver; r != nil { - reconciler.ReconcileSteps = append(reconciler.ReconcileSteps, controllers.ResolveBundle(r)) + reconciler.ReconcileSteps = append(reconciler.ReconcileSteps, controllers.ResolveBundle(r, cl)) } if i := d.ImagePuller; i != nil { reconciler.ReconcileSteps = append(reconciler.ReconcileSteps, controllers.UnpackBundle(i, d.ImageCache)) diff --git a/test/utils.go b/test/utils.go index 22a50b2b8d..60cfaa38a0 100644 --- a/test/utils.go +++ b/test/utils.go @@ -14,6 +14,7 @@ func NewEnv() *envtest.Environment { testEnv := &envtest.Environment{ CRDDirectoryPaths: []string{ pathFromProjectRoot("helm/olmv1/base/operator-controller/crd/experimental"), + pathFromProjectRoot("helm/olmv1/base/catalogd/crd/experimental"), }, ErrorIfCRDPathMissing: true, }