From 34708c118a14ea851751b13d24e66077f7d5626c Mon Sep 17 00:00:00 2001 From: Martin Schuppert Date: Thu, 26 Mar 2026 13:49:44 +0100 Subject: [PATCH] Fix MinorUpdateOVNDataplane race when OVN image unchanged between versions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 63fd70544f removed the nodeset.IsReady() check from DataplaneNodesetsOVNControllerImagesMatch to fix the minor update workflow getting stuck when unrelated deployments were running. That check was too strict — it blocked when any deployment was in progress, even if the OVN update had already completed. However, the remaining pure image comparison is too loose. When the OVN controller image does not change between two OpenStack versions, the nodeset already has the matching image from the previous update. The OpenStackVersion controller then sets MinorUpdateOVNDataplane=True immediately, before the edpm-ovn-update deployment finishes. This causes the subsequent minor update steps (controlplane update, edpm services update) to proceed while the OVN dataplane deployment is still running — resulting in both dataplane deployments running concurrently. Fix this with two mechanisms: 1. IsDataplaneDeploymentRunningForContainerImage: a new generic function that lists all in-progress OpenStackDataPlaneDeployment resources, resolves which services each deploys (from ServicesOverride or the nodeset's service list), and inspects each service's ContainerImageFields to determine if it manages the specified container image. This does not rely on deployment or service naming conventions and works with custom services. 2. Saved condition state tracking: when the OVN image is unchanged between the deployed and target versions, image comparison alone cannot distinguish "already updated" from "not yet updated." To handle this, we use the saved condition state (captured before Init resets conditions each reconciliation) to track whether a running OVN deployment has been observed during this update cycle: - Running OVN deployment seen → set condition False(RequestedReason) - No running deployment + previous condition was RequestedReason → deployment completed, proceed - No running deployment + previous condition was NOT RequestedReason → deployment not created yet, keep waiting The OpenStackVersion controller watches nodesets, so when a deployment starts (nodeset becomes not-Ready) and completes (nodeset becomes Ready), reconciliation is triggered, ensuring we observe the running deployment at least once. When the OVN image differs between versions, the existing image match check is sufficient — the nodeset's ContainerImages are only updated on successful deployment completion, so a match proves the deployment ran. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Martin Schuppert --- .../core/openstackversion_controller.go | 69 ++++++++++++++++ internal/openstack/dataplane.go | 80 +++++++++++++++++++ 2 files changed, 149 insertions(+) diff --git a/internal/controller/core/openstackversion_controller.go b/internal/controller/core/openstackversion_controller.go index 1c2a57b11..3e66e217b 100644 --- a/internal/controller/core/openstackversion_controller.go +++ b/internal/controller/core/openstackversion_controller.go @@ -291,6 +291,75 @@ func (r *OpenStackVersionReconciler) Reconcile(ctx context.Context, req ctrl.Req Log.Info("Waiting on OVN Dataplane updates to complete") return ctrl.Result{}, nil } + + // When the OVN controller image is the same between the deployed + // version and the target version, the image comparison above always + // passes because the nodeset already has the matching image from + // the previous update. In this case we need additional checks to + // confirm the OVN dataplane deployment for this update cycle has + // actually completed. + // + // We use the saved condition state (from before Init reset) to + // track whether we have observed a running OVN deployment during + // this update cycle: + // - If we see a running OVN deployment now: set condition False + // (RequestedReason) to record that we observed one + // - If no running OVN deployment AND the previous condition was + // False/RequestedReason: the deployment we saw previously has + // completed → proceed (fall through to set True) + // - If no running OVN deployment AND the previous condition was + // NOT False/RequestedReason (e.g. still Unknown from Init): + // we haven't seen a deployment yet → keep waiting + // + // When the image differs between versions, the image match alone + // is sufficient proof that a deployment updated it, since the + // nodeset's ContainerImages are only set on successful completion. + deployedDefaults, hasDeployedDefaults := instance.Status.ContainerImageVersionDefaults[*instance.Status.DeployedVersion] + if hasDeployedDefaults && + deployedDefaults.OvnControllerImage != nil && + instance.Status.ContainerImages.OvnControllerImage != nil && + *deployedDefaults.OvnControllerImage == *instance.Status.ContainerImages.OvnControllerImage { + + ovnDeploymentRunning, err := openstack.IsDataplaneDeploymentRunningForContainerImage( + ctx, versionHelper, instance.Namespace, dataplaneNodesets, "OvnControllerImage") + if err != nil { + return ctrl.Result{}, err + } + + if ovnDeploymentRunning { + // OVN deployment is actively running — record this in + // the condition so we can detect its completion later. + instance.Status.Conditions.Set(condition.FalseCondition( + corev1beta1.OpenStackVersionMinorUpdateOVNDataplane, + condition.RequestedReason, + condition.SeverityInfo, + corev1beta1.OpenStackVersionMinorUpdateReadyRunningMessage)) + Log.Info("Waiting on OVN Dataplane deployment to complete (OVN image unchanged between versions)") + return ctrl.Result{}, nil + } + + // No OVN deployment running. Check the saved condition state + // from the previous reconciliation to determine if we ever + // observed one running during this update cycle. + prevOvnDataplaneCond := savedConditions.Get(corev1beta1.OpenStackVersionMinorUpdateOVNDataplane) + if prevOvnDataplaneCond == nil || + prevOvnDataplaneCond.Reason != condition.RequestedReason { + // We have never observed a running OVN deployment in + // this update cycle — the deployment has not been + // created yet. Keep waiting. + instance.Status.Conditions.Set(condition.FalseCondition( + corev1beta1.OpenStackVersionMinorUpdateOVNDataplane, + condition.InitReason, + condition.SeverityInfo, + corev1beta1.OpenStackVersionMinorUpdateReadyRunningMessage)) + Log.Info("Waiting for OVN Dataplane deployment to be created (OVN image unchanged between versions)") + return ctrl.Result{}, nil + } + // Previously saw a running OVN deployment (condition was + // False/RequestedReason), now no OVN deployment is running + // → the deployment has completed. Fall through to set True. + Log.Info("OVN Dataplane deployment completed (OVN image unchanged between versions)") + } } instance.Status.Conditions.MarkTrue( corev1beta1.OpenStackVersionMinorUpdateOVNDataplane, diff --git a/internal/openstack/dataplane.go b/internal/openstack/dataplane.go index c4a456e78..e4c2bd4f4 100644 --- a/internal/openstack/dataplane.go +++ b/internal/openstack/dataplane.go @@ -2,11 +2,13 @@ package openstack import ( "context" + "slices" "github.com/openstack-k8s-operators/lib-common/modules/common/helper" corev1beta1 "github.com/openstack-k8s-operators/openstack-operator/api/core/v1beta1" dataplanev1 "github.com/openstack-k8s-operators/openstack-operator/api/dataplane/v1beta1" + "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -58,6 +60,84 @@ func DataplaneNodesetsOVNControllerImagesMatch(version *corev1beta1.OpenStackVer return true } +// IsDataplaneDeploymentRunningForContainerImage checks whether any in-progress +// OpenStackDataPlaneDeployment is deploying a service that manages the given +// containerImageField (e.g. "OvnControllerImage"). It resolves which services +// each deployment runs (from ServicesOverride or the nodeset's service list) +// and inspects the service's ContainerImageFields to determine if it manages +// the specified container image. +func IsDataplaneDeploymentRunningForContainerImage( + ctx context.Context, + h *helper.Helper, + namespace string, + dataplaneNodesets *dataplanev1.OpenStackDataPlaneNodeSetList, + containerImageField string, +) (bool, error) { + // List all deployments in the namespace + deployments := &dataplanev1.OpenStackDataPlaneDeploymentList{} + opts := []client.ListOption{ + client.InNamespace(namespace), + } + err := h.GetClient().List(ctx, deployments, opts...) + if err != nil { + return false, err + } + + // Build a map of nodeset name -> nodeset for quick lookup + nodesetMap := make(map[string]*dataplanev1.OpenStackDataPlaneNodeSet, len(dataplaneNodesets.Items)) + for i := range dataplaneNodesets.Items { + nodesetMap[dataplaneNodesets.Items[i].Name] = &dataplaneNodesets.Items[i] + } + + // Cache service lookups to avoid repeated API calls + serviceCache := make(map[string]*dataplanev1.OpenStackDataPlaneService) + + for _, deployment := range deployments.Items { + // Skip completed deployments + if deployment.Status.Deployed { + continue + } + + // Determine which services this deployment runs for each of its nodesets + for _, nodesetName := range deployment.Spec.NodeSets { + nodeset, exists := nodesetMap[nodesetName] + if !exists || len(nodeset.Spec.Nodes) == 0 { + continue + } + + var services []string + if len(deployment.Spec.ServicesOverride) != 0 { + services = deployment.Spec.ServicesOverride + } else { + services = nodeset.Spec.Services + } + + for _, serviceName := range services { + svc, cached := serviceCache[serviceName] + if !cached { + foundService := &dataplanev1.OpenStackDataPlaneService{} + err := h.GetClient().Get(ctx, types.NamespacedName{ + Name: serviceName, + Namespace: namespace, + }, foundService) + if err != nil { + // Service not found — skip it + continue + } + svc = foundService + serviceCache[serviceName] = svc + } + + if slices.Contains(svc.Spec.ContainerImageFields, containerImageField) { + return true, nil + } + } + } + } + + return false, nil +} + // DataplaneNodesetsDeployed returns true if all nodesets are deployed with the latest version func DataplaneNodesetsDeployed(version *corev1beta1.OpenStackVersion, dataplaneNodesets *dataplanev1.OpenStackDataPlaneNodeSetList) bool { for _, nodeset := range dataplaneNodesets.Items {