From 5ec748932ffa84ee3a7325e5b6179549e64ce1e8 Mon Sep 17 00:00:00 2001 From: Nitish Bhat Date: Fri, 3 Apr 2026 06:48:04 -0700 Subject: [PATCH] Fix Node Assignments calculations on operator restart (#1242) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix two bugs in DeviceConfig node assignment management: 1. buildNodeAssignments now logs and skips node assignment conflicts instead of returning a fatal error. A CR-level conflict should not block the entire operator — the runtime validateNodeAssignments check already handles this per-CR during reconciliation. 2. Remove premature updateNodeAssignments call during finalization that freed nodes from the in-memory map before the finalizer was removed. Node cleanup is now handled solely via the NotFound path after CR garbage collection, preventing other DeviceConfigs from claiming nodes mid-finalization. Also adds DRA driver DaemonSet cleanup to the finalization path, which was previously only handled during normal reconciliation. (cherry picked from commit a945553c4fe6061b783fd3826a761ba4d6cf5df9) --- .../controllers/device_config_reconciler.go | 31 ++++-- .../device_config_reconciler_test.go | 105 ++++++++++++++++++ .../mock_device_config_reconciler.go | 8 +- 3 files changed, 133 insertions(+), 11 deletions(-) diff --git a/internal/controllers/device_config_reconciler.go b/internal/controllers/device_config_reconciler.go index d72c5835..8acc864e 100644 --- a/internal/controllers/device_config_reconciler.go +++ b/internal/controllers/device_config_reconciler.go @@ -169,7 +169,7 @@ func (r *DeviceConfigReconciler) init(ctx context.Context) { r.initErr = err return } - r.initErr = r.helper.buildNodeAssignments(deviceConfigList) + r.initErr = r.helper.buildNodeAssignments(ctx, deviceConfigList) } //+kubebuilder:rbac:groups=amd.com,resources=deviceconfigs,verbs=get;list;watch;create;patch;update @@ -362,7 +362,7 @@ func (r *DeviceConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request type deviceConfigReconcilerHelperAPI interface { getRequestedDeviceConfig(ctx context.Context, namespacedName types.NamespacedName) (*amdv1alpha1.DeviceConfig, error) listDeviceConfigs(ctx context.Context) (*amdv1alpha1.DeviceConfigList, error) - buildNodeAssignments(deviceConfigList *amdv1alpha1.DeviceConfigList) error + buildNodeAssignments(ctx context.Context, deviceConfigList *amdv1alpha1.DeviceConfigList) error validateNodeAssignments(namespacedName string, nodes *v1.NodeList) error updateNodeAssignments(namespacedName string, nodes *v1.NodeList, isFinalizer bool) getDeviceConfigOwnedKMMModule(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) (*kmmv1beta1.Module, error) @@ -916,6 +916,23 @@ func (dcrh *deviceConfigReconcilerHelper) finalizeDeviceConfig(ctx context.Conte } } + // finalize DRA driver + draDS := appsv1.DaemonSet{} + namespacedName = types.NamespacedName{ + Namespace: devConfig.Namespace, + Name: devConfig.Name + utils.DRADriverNameSuffix, + } + if err := dcrh.client.Get(ctx, namespacedName, &draDS); err != nil { + if !k8serrors.IsNotFound(err) { + return fmt.Errorf("failed to get dra-driver daemonset %s: %v", namespacedName, err) + } + } else { + logger.Info("deleting dra-driver daemonset", "daemonset", namespacedName) + if err := dcrh.client.Delete(ctx, &draDS); err != nil { + return fmt.Errorf("failed to delete dra-driver daemonset %s: %v", namespacedName, err) + } + } + // finalize node labeller nlDS := appsv1.DaemonSet{} namespacedName = types.NamespacedName{ @@ -997,9 +1014,6 @@ func (dcrh *deviceConfigReconcilerHelper) finalizeDeviceConfig(ctx context.Conte logger.Error(err, "failed to update node labels") } - // Update nodeAssignments after DeviceConfig status update - dcrh.updateNodeAssignments(namespacedName.String(), nodes, true) - return nil } @@ -1526,11 +1540,13 @@ func (dcrh *deviceConfigReconcilerHelper) validateNodeAssignments(namespacedName return err } -func (dcrh *deviceConfigReconcilerHelper) buildNodeAssignments(deviceConfigList *amdv1alpha1.DeviceConfigList) error { +func (dcrh *deviceConfigReconcilerHelper) buildNodeAssignments(ctx context.Context, deviceConfigList *amdv1alpha1.DeviceConfigList) error { if deviceConfigList == nil { return nil } + logger := log.FromContext(ctx) + isReady := func(devConfig *amdv1alpha1.DeviceConfig) bool { ready := dcrh.conditionUpdater.GetReadyCondition(devConfig) if ready == nil { @@ -1552,7 +1568,8 @@ func (dcrh *deviceConfigReconcilerHelper) buildNodeAssignments(deviceConfigList } err := dcrh.validateNodeAssignments(namespacedName.String(), &v1.NodeList{Items: nodeItems}) if err != nil { - return err + logger.Error(err, "node assignment conflict detected during initialization, skipping DeviceConfig", "DeviceConfig", namespacedName) + continue } dcrh.updateNodeAssignments(namespacedName.String(), &v1.NodeList{Items: nodeItems}, false) } diff --git a/internal/controllers/device_config_reconciler_test.go b/internal/controllers/device_config_reconciler_test.go index 9a85b8ba..754f8746 100644 --- a/internal/controllers/device_config_reconciler_test.go +++ b/internal/controllers/device_config_reconciler_test.go @@ -40,6 +40,9 @@ import ( "github.com/ROCm/gpu-operator/internal/metricsexporter" "github.com/ROCm/gpu-operator/internal/testrunner" + utils "github.com/ROCm/gpu-operator/internal" + "github.com/ROCm/gpu-operator/internal/conditions" + amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" mock_client "github.com/ROCm/gpu-operator/internal/client" "github.com/ROCm/gpu-operator/internal/kmmmodule" @@ -342,6 +345,11 @@ var _ = Describe("finalizeDeviceConfig", func() { Namespace: devConfigNamespace, } + draDriverNN := types.NamespacedName{ + Name: devConfigName + utils.DRADriverNameSuffix, + Namespace: devConfigNamespace, + } + metricsNN := types.NamespacedName{ Name: devConfigName + "-" + metricsexporter.ExporterName, Namespace: devConfigNamespace, @@ -374,6 +382,7 @@ var _ = Describe("finalizeDeviceConfig", func() { } kubeClient.EXPECT().Get(ctx, devPluginNN, gomock.Any()).Return(statusErr).Times(1) + kubeClient.EXPECT().Get(ctx, draDriverNN, gomock.Any()).Return(statusErr).Times(1) kubeClient.EXPECT().Get(ctx, configmanagerNN, gomock.Any()).Return(statusErr).Times(1) kubeClient.EXPECT().Get(ctx, testrunnerNN, gomock.Any()).Return(statusErr).Times(1) kubeClient.EXPECT().Get(ctx, testNodeNN, gomock.Any()).Return(nil).Times(1) @@ -418,6 +427,7 @@ var _ = Describe("finalizeDeviceConfig", func() { kubeClient.EXPECT().Get(ctx, testNodeNN, gomock.Any()).Return(nil).Times(1), kubeClient.EXPECT().Get(ctx, metricsNN, gomock.Any()).Return(statusErr).Times(4), kubeClient.EXPECT().Get(ctx, devPluginNN, gomock.Any()).Return(statusErr).Times(1), + kubeClient.EXPECT().Get(ctx, draDriverNN, gomock.Any()).Return(statusErr).Times(1), kubeClient.EXPECT().Get(ctx, nodeLabellerNN, gomock.Any()).Return(k8serrors.NewNotFound(schema.GroupResource{}, "dsName")), kubeClient.EXPECT().Get(ctx, nn, gomock.Any()).Return(nil), kubeClient.EXPECT().Delete(ctx, gomock.Any()).Return(nil), @@ -441,6 +451,7 @@ var _ = Describe("finalizeDeviceConfig", func() { kubeClient.EXPECT().Get(ctx, testNodeNN, gomock.Any()).Return(nil).Times(1), kubeClient.EXPECT().Get(ctx, metricsNN, gomock.Any()).Return(statusErr).Times(4), kubeClient.EXPECT().Get(ctx, devPluginNN, gomock.Any()).Return(statusErr).Times(1), + kubeClient.EXPECT().Get(ctx, draDriverNN, gomock.Any()).Return(statusErr).Times(1), kubeClient.EXPECT().Get(ctx, nodeLabellerNN, gomock.Any()).Return(k8serrors.NewNotFound(schema.GroupResource{}, "dsName")), kubeClient.EXPECT().Get(ctx, nn, gomock.Any()).Return(fmt.Errorf("some error")), ) @@ -466,6 +477,7 @@ var _ = Describe("finalizeDeviceConfig", func() { kubeClient.EXPECT().Get(ctx, testNodeNN, gomock.Any()).Return(nil).Times(1), kubeClient.EXPECT().Get(ctx, metricsNN, gomock.Any()).Return(statusErr).Times(4), kubeClient.EXPECT().Get(ctx, devPluginNN, gomock.Any()).Return(statusErr).Times(1), + kubeClient.EXPECT().Get(ctx, draDriverNN, gomock.Any()).Return(statusErr).Times(1), kubeClient.EXPECT().Get(ctx, nodeLabellerNN, gomock.Any()).Return(k8serrors.NewNotFound(schema.GroupResource{}, "dsName")), kubeClient.EXPECT().Get(ctx, nn, gomock.Any()).Return(k8serrors.NewNotFound(schema.GroupResource{}, "moduleName")), kubeClient.EXPECT().Patch(ctx, expectedDevConfig, gomock.Any()).Return(nil), @@ -499,6 +511,7 @@ var _ = Describe("finalizeDeviceConfig", func() { kubeClient.EXPECT().Get(ctx, testNodeNN, gomock.Any()).Return(nil).Times(1), kubeClient.EXPECT().Get(ctx, metricsNN, gomock.Any()).Return(statusErr).Times(4), kubeClient.EXPECT().Get(ctx, devPluginNN, gomock.Any()).Return(statusErr).Times(1), + kubeClient.EXPECT().Get(ctx, draDriverNN, gomock.Any()).Return(statusErr).Times(1), kubeClient.EXPECT().Get(ctx, nodeLabellerNN, gomock.Any()).Return(k8serrors.NewNotFound(schema.GroupResource{}, "dsName")), kubeClient.EXPECT().Get(ctx, nn, gomock.Any()).Do( func(_ interface{}, _ interface{}, mod *kmmv1beta1.Module, _ ...client.GetOption) { @@ -716,3 +729,95 @@ var _ = Describe("handleNodeLabeller", func() { Expect(err).ToNot(HaveOccurred()) }) }) + +var _ = Describe("buildNodeAssignments", func() { + var ( + dcrh deviceConfigReconcilerHelperAPI + ) + + ctx := context.Background() + + makeReadyDeviceConfig := func(name, namespace string, nodes []string) amdv1alpha1.DeviceConfig { + nodeModuleStatus := map[string]amdv1alpha1.ModuleStatus{} + for _, n := range nodes { + nodeModuleStatus[n] = amdv1alpha1.ModuleStatus{} + } + return amdv1alpha1.DeviceConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Status: amdv1alpha1.DeviceConfigStatus{ + Conditions: []metav1.Condition{ + { + Type: conditions.ConditionTypeReady, + Status: metav1.ConditionTrue, + }, + }, + NodeModuleStatus: nodeModuleStatus, + }, + } + } + + BeforeEach(func() { + ctrl := gomock.NewController(GinkgoT()) + kubeClient := mock_client.NewMockClient(ctrl) + dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil, true) + }) + + It("skips non-ready DeviceConfigs", func() { + dcList := &amdv1alpha1.DeviceConfigList{ + Items: []amdv1alpha1.DeviceConfig{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "dc-not-ready", + Namespace: devConfigNamespace, + }, + Status: amdv1alpha1.DeviceConfigStatus{ + NodeModuleStatus: map[string]amdv1alpha1.ModuleStatus{ + "node-1": {}, + }, + }, + }, + }, + } + err := dcrh.buildNodeAssignments(ctx, dcList) + Expect(err).ToNot(HaveOccurred()) + }) + + It("logs and skips conflicting DeviceConfigs instead of returning error", func() { + dc1 := makeReadyDeviceConfig("dc-1", devConfigNamespace, []string{"node-shared"}) + dc2 := makeReadyDeviceConfig("dc-2", devConfigNamespace, []string{"node-shared"}) + + dcList := &amdv1alpha1.DeviceConfigList{ + Items: []amdv1alpha1.DeviceConfig{dc1, dc2}, + } + + // Should NOT return an error — conflicts are logged and skipped + err := dcrh.buildNodeAssignments(ctx, dcList) + Expect(err).ToNot(HaveOccurred()) + }) + + It("includes being-deleted DeviceConfigs in node assignments", func() { + now := metav1.Now() + dc := makeReadyDeviceConfig("dc-deleting", devConfigNamespace, []string{"node-owned"}) + dc.DeletionTimestamp = &now + + dcList := &amdv1alpha1.DeviceConfigList{ + Items: []amdv1alpha1.DeviceConfig{dc}, + } + + err := dcrh.buildNodeAssignments(ctx, dcList) + Expect(err).ToNot(HaveOccurred()) + + // The being-deleted DC's nodes should still be in the assignment map, + // so a second DC targeting the same node should trigger a conflict (logged, not error) + dc2 := makeReadyDeviceConfig("dc-new", devConfigNamespace, []string{"node-owned"}) + dcList2 := &amdv1alpha1.DeviceConfigList{ + Items: []amdv1alpha1.DeviceConfig{dc2}, + } + // Re-running with dc2 should log a conflict but not error + err = dcrh.buildNodeAssignments(ctx, dcList2) + Expect(err).ToNot(HaveOccurred()) + }) +}) diff --git a/internal/controllers/mock_device_config_reconciler.go b/internal/controllers/mock_device_config_reconciler.go index 10598b89..79667bcd 100644 --- a/internal/controllers/mock_device_config_reconciler.go +++ b/internal/controllers/mock_device_config_reconciler.go @@ -77,17 +77,17 @@ func (mr *MockdeviceConfigReconcilerHelperAPIMockRecorder) buildDeviceConfigStat } // buildNodeAssignments mocks base method. -func (m *MockdeviceConfigReconcilerHelperAPI) buildNodeAssignments(deviceConfigList *v1alpha1.DeviceConfigList) error { +func (m *MockdeviceConfigReconcilerHelperAPI) buildNodeAssignments(ctx context.Context, deviceConfigList *v1alpha1.DeviceConfigList) error { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "buildNodeAssignments", deviceConfigList) + ret := m.ctrl.Call(m, "buildNodeAssignments", ctx, deviceConfigList) ret0, _ := ret[0].(error) return ret0 } // buildNodeAssignments indicates an expected call of buildNodeAssignments. -func (mr *MockdeviceConfigReconcilerHelperAPIMockRecorder) buildNodeAssignments(deviceConfigList any) *gomock.Call { +func (mr *MockdeviceConfigReconcilerHelperAPIMockRecorder) buildNodeAssignments(ctx, deviceConfigList any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "buildNodeAssignments", reflect.TypeOf((*MockdeviceConfigReconcilerHelperAPI)(nil).buildNodeAssignments), deviceConfigList) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "buildNodeAssignments", reflect.TypeOf((*MockdeviceConfigReconcilerHelperAPI)(nil).buildNodeAssignments), ctx, deviceConfigList) } // deleteCondition mocks base method.