Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 24 additions & 7 deletions internal/controllers/device_config_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ func (r *DeviceConfigReconciler) init(ctx context.Context) {
r.initErr = err
return
}
r.initErr = r.helper.buildNodeAssignments(deviceConfigList)
r.initErr = r.helper.buildNodeAssignments(ctx, deviceConfigList)
}

//+kubebuilder:rbac:groups=amd.com,resources=deviceconfigs,verbs=get;list;watch;create;patch;update
Expand Down Expand Up @@ -362,7 +362,7 @@ func (r *DeviceConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request
type deviceConfigReconcilerHelperAPI interface {
getRequestedDeviceConfig(ctx context.Context, namespacedName types.NamespacedName) (*amdv1alpha1.DeviceConfig, error)
listDeviceConfigs(ctx context.Context) (*amdv1alpha1.DeviceConfigList, error)
buildNodeAssignments(deviceConfigList *amdv1alpha1.DeviceConfigList) error
buildNodeAssignments(ctx context.Context, deviceConfigList *amdv1alpha1.DeviceConfigList) error
validateNodeAssignments(namespacedName string, nodes *v1.NodeList) error
updateNodeAssignments(namespacedName string, nodes *v1.NodeList, isFinalizer bool)
getDeviceConfigOwnedKMMModule(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) (*kmmv1beta1.Module, error)
Expand Down Expand Up @@ -916,6 +916,23 @@ func (dcrh *deviceConfigReconcilerHelper) finalizeDeviceConfig(ctx context.Conte
}
}

// finalize DRA driver
draDS := appsv1.DaemonSet{}
namespacedName = types.NamespacedName{
Namespace: devConfig.Namespace,
Name: devConfig.Name + utils.DRADriverNameSuffix,
}
if err := dcrh.client.Get(ctx, namespacedName, &draDS); err != nil {
if !k8serrors.IsNotFound(err) {
return fmt.Errorf("failed to get dra-driver daemonset %s: %v", namespacedName, err)
}
} else {
logger.Info("deleting dra-driver daemonset", "daemonset", namespacedName)
if err := dcrh.client.Delete(ctx, &draDS); err != nil {
return fmt.Errorf("failed to delete dra-driver daemonset %s: %v", namespacedName, err)
}
}

// finalize node labeller
nlDS := appsv1.DaemonSet{}
namespacedName = types.NamespacedName{
Expand Down Expand Up @@ -997,9 +1014,6 @@ func (dcrh *deviceConfigReconcilerHelper) finalizeDeviceConfig(ctx context.Conte
logger.Error(err, "failed to update node labels")
}

// Update nodeAssignments after DeviceConfig status update
dcrh.updateNodeAssignments(namespacedName.String(), nodes, true)

return nil
}

Expand Down Expand Up @@ -1526,11 +1540,13 @@ func (dcrh *deviceConfigReconcilerHelper) validateNodeAssignments(namespacedName
return err
}

func (dcrh *deviceConfigReconcilerHelper) buildNodeAssignments(deviceConfigList *amdv1alpha1.DeviceConfigList) error {
func (dcrh *deviceConfigReconcilerHelper) buildNodeAssignments(ctx context.Context, deviceConfigList *amdv1alpha1.DeviceConfigList) error {
if deviceConfigList == nil {
return nil
}

logger := log.FromContext(ctx)

isReady := func(devConfig *amdv1alpha1.DeviceConfig) bool {
ready := dcrh.conditionUpdater.GetReadyCondition(devConfig)
if ready == nil {
Expand All @@ -1552,7 +1568,8 @@ func (dcrh *deviceConfigReconcilerHelper) buildNodeAssignments(deviceConfigList
}
err := dcrh.validateNodeAssignments(namespacedName.String(), &v1.NodeList{Items: nodeItems})
if err != nil {
return err
logger.Error(err, "node assignment conflict detected during initialization, skipping DeviceConfig", "DeviceConfig", namespacedName)
continue
}
dcrh.updateNodeAssignments(namespacedName.String(), &v1.NodeList{Items: nodeItems}, false)
}
Expand Down
105 changes: 105 additions & 0 deletions internal/controllers/device_config_reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ import (
"github.com/ROCm/gpu-operator/internal/metricsexporter"
"github.com/ROCm/gpu-operator/internal/testrunner"

utils "github.com/ROCm/gpu-operator/internal"
"github.com/ROCm/gpu-operator/internal/conditions"

amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1"
mock_client "github.com/ROCm/gpu-operator/internal/client"
"github.com/ROCm/gpu-operator/internal/kmmmodule"
Expand Down Expand Up @@ -342,6 +345,11 @@ var _ = Describe("finalizeDeviceConfig", func() {
Namespace: devConfigNamespace,
}

draDriverNN := types.NamespacedName{
Name: devConfigName + utils.DRADriverNameSuffix,
Namespace: devConfigNamespace,
}

metricsNN := types.NamespacedName{
Name: devConfigName + "-" + metricsexporter.ExporterName,
Namespace: devConfigNamespace,
Expand Down Expand Up @@ -374,6 +382,7 @@ var _ = Describe("finalizeDeviceConfig", func() {
}

kubeClient.EXPECT().Get(ctx, devPluginNN, gomock.Any()).Return(statusErr).Times(1)
kubeClient.EXPECT().Get(ctx, draDriverNN, gomock.Any()).Return(statusErr).Times(1)
kubeClient.EXPECT().Get(ctx, configmanagerNN, gomock.Any()).Return(statusErr).Times(1)
kubeClient.EXPECT().Get(ctx, testrunnerNN, gomock.Any()).Return(statusErr).Times(1)
kubeClient.EXPECT().Get(ctx, testNodeNN, gomock.Any()).Return(nil).Times(1)
Expand Down Expand Up @@ -418,6 +427,7 @@ var _ = Describe("finalizeDeviceConfig", func() {
kubeClient.EXPECT().Get(ctx, testNodeNN, gomock.Any()).Return(nil).Times(1),
kubeClient.EXPECT().Get(ctx, metricsNN, gomock.Any()).Return(statusErr).Times(4),
kubeClient.EXPECT().Get(ctx, devPluginNN, gomock.Any()).Return(statusErr).Times(1),
kubeClient.EXPECT().Get(ctx, draDriverNN, gomock.Any()).Return(statusErr).Times(1),
kubeClient.EXPECT().Get(ctx, nodeLabellerNN, gomock.Any()).Return(k8serrors.NewNotFound(schema.GroupResource{}, "dsName")),
kubeClient.EXPECT().Get(ctx, nn, gomock.Any()).Return(nil),
kubeClient.EXPECT().Delete(ctx, gomock.Any()).Return(nil),
Expand All @@ -441,6 +451,7 @@ var _ = Describe("finalizeDeviceConfig", func() {
kubeClient.EXPECT().Get(ctx, testNodeNN, gomock.Any()).Return(nil).Times(1),
kubeClient.EXPECT().Get(ctx, metricsNN, gomock.Any()).Return(statusErr).Times(4),
kubeClient.EXPECT().Get(ctx, devPluginNN, gomock.Any()).Return(statusErr).Times(1),
kubeClient.EXPECT().Get(ctx, draDriverNN, gomock.Any()).Return(statusErr).Times(1),
kubeClient.EXPECT().Get(ctx, nodeLabellerNN, gomock.Any()).Return(k8serrors.NewNotFound(schema.GroupResource{}, "dsName")),
kubeClient.EXPECT().Get(ctx, nn, gomock.Any()).Return(fmt.Errorf("some error")),
)
Expand All @@ -466,6 +477,7 @@ var _ = Describe("finalizeDeviceConfig", func() {
kubeClient.EXPECT().Get(ctx, testNodeNN, gomock.Any()).Return(nil).Times(1),
kubeClient.EXPECT().Get(ctx, metricsNN, gomock.Any()).Return(statusErr).Times(4),
kubeClient.EXPECT().Get(ctx, devPluginNN, gomock.Any()).Return(statusErr).Times(1),
kubeClient.EXPECT().Get(ctx, draDriverNN, gomock.Any()).Return(statusErr).Times(1),
kubeClient.EXPECT().Get(ctx, nodeLabellerNN, gomock.Any()).Return(k8serrors.NewNotFound(schema.GroupResource{}, "dsName")),
kubeClient.EXPECT().Get(ctx, nn, gomock.Any()).Return(k8serrors.NewNotFound(schema.GroupResource{}, "moduleName")),
kubeClient.EXPECT().Patch(ctx, expectedDevConfig, gomock.Any()).Return(nil),
Expand Down Expand Up @@ -499,6 +511,7 @@ var _ = Describe("finalizeDeviceConfig", func() {
kubeClient.EXPECT().Get(ctx, testNodeNN, gomock.Any()).Return(nil).Times(1),
kubeClient.EXPECT().Get(ctx, metricsNN, gomock.Any()).Return(statusErr).Times(4),
kubeClient.EXPECT().Get(ctx, devPluginNN, gomock.Any()).Return(statusErr).Times(1),
kubeClient.EXPECT().Get(ctx, draDriverNN, gomock.Any()).Return(statusErr).Times(1),
kubeClient.EXPECT().Get(ctx, nodeLabellerNN, gomock.Any()).Return(k8serrors.NewNotFound(schema.GroupResource{}, "dsName")),
kubeClient.EXPECT().Get(ctx, nn, gomock.Any()).Do(
func(_ interface{}, _ interface{}, mod *kmmv1beta1.Module, _ ...client.GetOption) {
Expand Down Expand Up @@ -716,3 +729,95 @@ var _ = Describe("handleNodeLabeller", func() {
Expect(err).ToNot(HaveOccurred())
})
})

// Tests for deviceConfigReconcilerHelper.buildNodeAssignments, covering the
// behavior changed in this PR: conflicts found during initialization are
// logged and the offending DeviceConfig is skipped, instead of aborting
// initialization with an error.
var _ = Describe("buildNodeAssignments", func() {
	var (
		// dcrh is rebuilt for every spec so node-assignment state does not
		// leak between It blocks (except intentionally within a single It).
		dcrh deviceConfigReconcilerHelperAPI
	)

	ctx := context.Background()

	// makeReadyDeviceConfig returns a DeviceConfig whose Ready condition is
	// True and whose Status.NodeModuleStatus contains an (empty) entry for
	// each of the given node names — i.e. a config that buildNodeAssignments
	// should treat as ready and as owning those nodes.
	makeReadyDeviceConfig := func(name, namespace string, nodes []string) amdv1alpha1.DeviceConfig {
		nodeModuleStatus := map[string]amdv1alpha1.ModuleStatus{}
		for _, n := range nodes {
			nodeModuleStatus[n] = amdv1alpha1.ModuleStatus{}
		}
		return amdv1alpha1.DeviceConfig{
			ObjectMeta: metav1.ObjectMeta{
				Name:      name,
				Namespace: namespace,
			},
			Status: amdv1alpha1.DeviceConfigStatus{
				Conditions: []metav1.Condition{
					{
						Type:   conditions.ConditionTypeReady,
						Status: metav1.ConditionTrue,
					},
				},
				NodeModuleStatus: nodeModuleStatus,
			},
		}
	}

	BeforeEach(func() {
		// A mock client with no expectations: buildNodeAssignments is not
		// expected to hit the API server in these specs. The remaining
		// collaborators are nil — presumably unused on this code path;
		// TODO(review): confirm newDeviceConfigReconcilerHelper tolerates
		// nil for each of them.
		ctrl := gomock.NewController(GinkgoT())
		kubeClient := mock_client.NewMockClient(ctrl)
		dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil, true)
	})

	It("skips non-ready DeviceConfigs", func() {
		// No Ready condition at all — the config must be ignored even though
		// it reports a node in NodeModuleStatus.
		dcList := &amdv1alpha1.DeviceConfigList{
			Items: []amdv1alpha1.DeviceConfig{
				{
					ObjectMeta: metav1.ObjectMeta{
						Name:      "dc-not-ready",
						Namespace: devConfigNamespace,
					},
					Status: amdv1alpha1.DeviceConfigStatus{
						NodeModuleStatus: map[string]amdv1alpha1.ModuleStatus{
							"node-1": {},
						},
					},
				},
			},
		}
		err := dcrh.buildNodeAssignments(ctx, dcList)
		Expect(err).ToNot(HaveOccurred())
	})

	It("logs and skips conflicting DeviceConfigs instead of returning error", func() {
		// Two ready DeviceConfigs claiming the same node: the second is a
		// conflict. Pre-PR this returned an error; now it must not.
		dc1 := makeReadyDeviceConfig("dc-1", devConfigNamespace, []string{"node-shared"})
		dc2 := makeReadyDeviceConfig("dc-2", devConfigNamespace, []string{"node-shared"})

		dcList := &amdv1alpha1.DeviceConfigList{
			Items: []amdv1alpha1.DeviceConfig{dc1, dc2},
		}

		// Should NOT return an error — conflicts are logged and skipped
		err := dcrh.buildNodeAssignments(ctx, dcList)
		Expect(err).ToNot(HaveOccurred())
	})

	It("includes being-deleted DeviceConfigs in node assignments", func() {
		// A DeviceConfig with a deletion timestamp still counts toward node
		// ownership until its finalizer removes the assignment.
		now := metav1.Now()
		dc := makeReadyDeviceConfig("dc-deleting", devConfigNamespace, []string{"node-owned"})
		dc.DeletionTimestamp = &now

		dcList := &amdv1alpha1.DeviceConfigList{
			Items: []amdv1alpha1.DeviceConfig{dc},
		}

		err := dcrh.buildNodeAssignments(ctx, dcList)
		Expect(err).ToNot(HaveOccurred())

		// The being-deleted DC's nodes should still be in the assignment map,
		// so a second DC targeting the same node should trigger a conflict (logged, not error)
		dc2 := makeReadyDeviceConfig("dc-new", devConfigNamespace, []string{"node-owned"})
		dcList2 := &amdv1alpha1.DeviceConfigList{
			Items: []amdv1alpha1.DeviceConfig{dc2},
		}
		// Re-running with dc2 should log a conflict but not error
		// (dcrh is deliberately shared between these two calls so the
		// assignment state from the first call is still present).
		err = dcrh.buildNodeAssignments(ctx, dcList2)
		Expect(err).ToNot(HaveOccurred())
	})
})
8 changes: 4 additions & 4 deletions internal/controllers/mock_device_config_reconciler.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.