Skip to content

Commit 926700c

Browse files
committed
Handle CnsNotRegisteredFault by re-registering volumes in CNS operations
This commit adds handling for CnsNotRegisteredFault in various CNS volume operations for the WORKLOAD cluster flavor. When a volume operation fails with CnsNotRegisteredFault, the driver now attempts to re-register the volume with CNS and retries the operation. Changes include: - Add clusterId and clusterDistribution parameters to GetManager() for volume re-registration - Add ReRegisterVolume() public method to re-register unregistered volumes - Add IsCnsNotRegisteredFault() helper to detect the fault type - Add IsCnsVolumeAlreadyExistsFault() helper for idempotent re-registration - Handle CnsNotRegisteredFault in: - AttachVolume - DetachVolume - DeleteVolume (with improved idempotency) - UpdateVolumeMetadata - UpdateVolumeCrypto - ExpandVolume (with improved idempotency) - CreateSnapshot (with improved idempotency and with transaction) - DeleteSnapshot - RelocateVolume (in migrationController) The re-registration is only attempted once per operation to prevent infinite loops. If re-registration fails or the retry fails, the original error is returned.
1 parent 8f5b046 commit 926700c

File tree

22 files changed

+679
-107
lines changed

22 files changed

+679
-107
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ require (
2424
github.com/stretchr/testify v1.11.1
2525
github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250923172217-bf5a74e51c65
2626
github.com/vmware-tanzu/vm-operator/external/byok v0.0.0-20250509154507-b93e51fc90fa
27-
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203163802-5ce652387dac
27+
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203213634-99f18b71ea8e
2828
go.uber.org/zap v1.27.1
2929
golang.org/x/crypto v0.46.0
3030
golang.org/x/sync v0.19.0

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -315,8 +315,8 @@ github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250923172217-bf5a74e51c65 h1:
315315
github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250923172217-bf5a74e51c65/go.mod h1:nWTPpxfe4gHuuYuFcrs86+NMxfkqPk3a3IlvI8TCWak=
316316
github.com/vmware-tanzu/vm-operator/external/byok v0.0.0-20250509154507-b93e51fc90fa h1:4MKu14YJ7J54O6QKmT4ds5EUpysWLLtQRMff73cVkmU=
317317
github.com/vmware-tanzu/vm-operator/external/byok v0.0.0-20250509154507-b93e51fc90fa/go.mod h1:8tiuyYslzjLIUmOlXZuGKQdQP2ZgWGCVhVeyptmZYnk=
318-
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203163802-5ce652387dac h1:E3W+2J1I0B5LyIillKYVQHIb6CpslGcogt7Q+8FHT3c=
319-
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203163802-5ce652387dac/go.mod h1:FM3GTg002dFFN7l2/hNS0YWC4f78HTw4kvgUwAE52cM=
318+
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203213634-99f18b71ea8e h1:TG9xuPu9N29Ak1gNs85VsMImNv1bd2l0yNfAMc3imOU=
319+
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203213634-99f18b71ea8e/go.mod h1:FM3GTg002dFFN7l2/hNS0YWC4f78HTw4kvgUwAE52cM=
320320
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
321321
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
322322
github.com/xiang90/probing v0.0.0-20221125231312-a49e3df8f510 h1:S2dVYn90KE98chqDkyE9Z4N61UnQd+KOfgp5Iu53llk=

pkg/common/cns-lib/volume/manager.go

Lines changed: 298 additions & 18 deletions
Large diffs are not rendered by default.

pkg/common/cns-lib/volume/manager_mock.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,3 +199,11 @@ func (m MockManager) SyncVolume(ctx context.Context, syncVolumeSpecs []cnstypes.
199199
//TODO implement me
200200
panic("implement me")
201201
}
202+
203+
func (m MockManager) ReRegisterVolume(ctx context.Context, volumeID string) error {
204+
if m.failRequest {
205+
return m.err
206+
}
207+
208+
return nil
209+
}

pkg/common/cns-lib/volume/manager_test.go

Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/stretchr/testify/assert"
1010
cnstypes "github.com/vmware/govmomi/cns/types"
1111
vim25types "github.com/vmware/govmomi/vim25/types"
12+
"sigs.k8s.io/vsphere-csi-driver/v3/pkg/csi/service/logger"
1213
)
1314

1415
const createVolumeTaskTimeout = 3 * time.Second
@@ -284,3 +285,248 @@ func TestAttachVolumeFaultHandling(t *testing.T) {
284285
})
285286
}
286287
}
288+
289+
// TestIsCnsNotRegisteredFault tests the IsCnsNotRegisteredFault helper function
290+
// that detects CnsNotRegisteredFault from CNS operations.
291+
func TestIsCnsNotRegisteredFault(t *testing.T) {
292+
ctx := logger.NewContextWithLogger(context.Background())
293+
294+
tests := []struct {
295+
name string
296+
fault *vim25types.LocalizedMethodFault
297+
expected bool
298+
}{
299+
{
300+
name: "nil fault",
301+
fault: nil,
302+
expected: false,
303+
},
304+
{
305+
name: "fault with nil Fault field",
306+
fault: &vim25types.LocalizedMethodFault{
307+
LocalizedMessage: "Some error",
308+
Fault: nil,
309+
},
310+
expected: false,
311+
},
312+
{
313+
name: "CnsNotRegisteredFault",
314+
fault: &vim25types.LocalizedMethodFault{
315+
LocalizedMessage: "The input volume is not registered as a CNS volume",
316+
Fault: &cnstypes.CnsNotRegisteredFault{},
317+
},
318+
expected: true,
319+
},
320+
{
321+
name: "CnsFault (not CnsNotRegisteredFault)",
322+
fault: &vim25types.LocalizedMethodFault{
323+
LocalizedMessage: "Generic CNS fault",
324+
Fault: &cnstypes.CnsFault{
325+
Reason: "Generic error",
326+
},
327+
},
328+
expected: false,
329+
},
330+
{
331+
name: "ResourceInUse fault",
332+
fault: &vim25types.LocalizedMethodFault{
333+
LocalizedMessage: "Resource is in use",
334+
Fault: &vim25types.ResourceInUse{
335+
Type: "Disk",
336+
Name: "test-disk",
337+
},
338+
},
339+
expected: false,
340+
},
341+
{
342+
name: "NotSupported fault",
343+
fault: &vim25types.LocalizedMethodFault{
344+
LocalizedMessage: "Operation not supported",
345+
Fault: &vim25types.NotSupported{},
346+
},
347+
expected: false,
348+
},
349+
}
350+
351+
for _, tt := range tests {
352+
t.Run(tt.name, func(t *testing.T) {
353+
result := IsCnsNotRegisteredFault(ctx, tt.fault)
354+
assert.Equal(t, tt.expected, result,
355+
"IsCnsNotRegisteredFault(%v) = %v, expected %v", tt.fault, result, tt.expected)
356+
})
357+
}
358+
}
359+
360+
// TestIsCnsVolumeAlreadyExistsFault tests the IsCnsVolumeAlreadyExistsFault helper function
361+
// that checks if a fault type indicates the volume already exists in CNS.
362+
func TestIsCnsVolumeAlreadyExistsFault(t *testing.T) {
363+
ctx := logger.NewContextWithLogger(context.Background())
364+
365+
tests := []struct {
366+
name string
367+
faultType string
368+
expected bool
369+
}{
370+
{
371+
name: "CnsVolumeAlreadyExistsFault",
372+
faultType: "vim.fault.CnsVolumeAlreadyExistsFault",
373+
expected: true,
374+
},
375+
{
376+
name: "empty fault type",
377+
faultType: "",
378+
expected: false,
379+
},
380+
{
381+
name: "CnsFault",
382+
faultType: "vim.fault.CnsFault",
383+
expected: false,
384+
},
385+
{
386+
name: "NotSupported",
387+
faultType: "vim25:NotSupported",
388+
expected: false,
389+
},
390+
{
391+
name: "partial match - should not match",
392+
faultType: "CnsVolumeAlreadyExistsFault",
393+
expected: false,
394+
},
395+
{
396+
name: "case sensitive - lowercase should not match",
397+
faultType: "vim.fault.cnsvolumealreadyexistsfault",
398+
expected: false,
399+
},
400+
}
401+
402+
for _, tt := range tests {
403+
t.Run(tt.name, func(t *testing.T) {
404+
result := IsCnsVolumeAlreadyExistsFault(ctx, tt.faultType)
405+
assert.Equal(t, tt.expected, result,
406+
"IsCnsVolumeAlreadyExistsFault(%q) = %v, expected %v", tt.faultType, result, tt.expected)
407+
})
408+
}
409+
}
410+
411+
// TestCnsNotRegisteredFaultHandlingScenarios tests the scenarios where
412+
// CnsNotRegisteredFault should trigger re-registration of the volume.
413+
func TestCnsNotRegisteredFaultHandlingScenarios(t *testing.T) {
414+
ctx := logger.NewContextWithLogger(context.Background())
415+
416+
tests := []struct {
417+
name string
418+
volumeOperationResult *cnstypes.CnsVolumeOperationResult
419+
expectReRegistrationNeeded bool
420+
description string
421+
}{
422+
{
423+
name: "CnsNotRegisteredFault should trigger re-registration",
424+
volumeOperationResult: &cnstypes.CnsVolumeOperationResult{
425+
VolumeId: cnstypes.CnsVolumeId{Id: "test-volume-1"},
426+
Fault: &vim25types.LocalizedMethodFault{
427+
LocalizedMessage: "The input volume is not registered as a CNS volume",
428+
Fault: &cnstypes.CnsNotRegisteredFault{},
429+
},
430+
},
431+
expectReRegistrationNeeded: true,
432+
description: "Volume not registered in CNS should trigger re-registration",
433+
},
434+
{
435+
name: "CnsFault should not trigger re-registration",
436+
volumeOperationResult: &cnstypes.CnsVolumeOperationResult{
437+
VolumeId: cnstypes.CnsVolumeId{Id: "test-volume-2"},
438+
Fault: &vim25types.LocalizedMethodFault{
439+
LocalizedMessage: "Generic CNS error",
440+
Fault: &cnstypes.CnsFault{
441+
Reason: "Some other error",
442+
},
443+
},
444+
},
445+
expectReRegistrationNeeded: false,
446+
description: "Generic CnsFault should not trigger re-registration",
447+
},
448+
{
449+
name: "No fault should not trigger re-registration",
450+
volumeOperationResult: &cnstypes.CnsVolumeOperationResult{
451+
VolumeId: cnstypes.CnsVolumeId{Id: "test-volume-3"},
452+
Fault: nil,
453+
},
454+
expectReRegistrationNeeded: false,
455+
description: "Successful operation should not trigger re-registration",
456+
},
457+
{
458+
name: "ResourceInUse should not trigger re-registration",
459+
volumeOperationResult: &cnstypes.CnsVolumeOperationResult{
460+
VolumeId: cnstypes.CnsVolumeId{Id: "test-volume-4"},
461+
Fault: &vim25types.LocalizedMethodFault{
462+
LocalizedMessage: "Resource is in use",
463+
Fault: &vim25types.ResourceInUse{
464+
Type: "Disk",
465+
Name: "test-disk",
466+
},
467+
},
468+
},
469+
expectReRegistrationNeeded: false,
470+
description: "ResourceInUse should not trigger re-registration",
471+
},
472+
}
473+
474+
for _, tt := range tests {
475+
t.Run(tt.name, func(t *testing.T) {
476+
var reRegistrationNeeded bool
477+
if tt.volumeOperationResult.Fault != nil {
478+
reRegistrationNeeded = IsCnsNotRegisteredFault(ctx, tt.volumeOperationResult.Fault)
479+
}
480+
assert.Equal(t, tt.expectReRegistrationNeeded, reRegistrationNeeded,
481+
"Test: %s - %s", tt.name, tt.description)
482+
})
483+
}
484+
}
485+
486+
// TestReRegistrationIdempotency tests that re-registration handles
487+
// CnsVolumeAlreadyExistsFault gracefully (idempotent behavior).
488+
func TestReRegistrationIdempotency(t *testing.T) {
489+
ctx := logger.NewContextWithLogger(context.Background())
490+
491+
tests := []struct {
492+
name string
493+
faultTypeAfterReRegister string
494+
expectSuccess bool
495+
description string
496+
}{
497+
{
498+
name: "Re-registration succeeds",
499+
faultTypeAfterReRegister: "",
500+
expectSuccess: true,
501+
description: "Re-registration with no fault should succeed",
502+
},
503+
{
504+
name: "Volume already exists (idempotent)",
505+
faultTypeAfterReRegister: "vim.fault.CnsVolumeAlreadyExistsFault",
506+
expectSuccess: true,
507+
description: "CnsVolumeAlreadyExistsFault should be treated as success",
508+
},
509+
{
510+
name: "Other fault during re-registration",
511+
faultTypeAfterReRegister: "vim.fault.CnsFault",
512+
expectSuccess: false,
513+
description: "Other faults should be treated as failure",
514+
},
515+
}
516+
517+
for _, tt := range tests {
518+
t.Run(tt.name, func(t *testing.T) {
519+
var success bool
520+
if tt.faultTypeAfterReRegister == "" {
521+
success = true
522+
} else if IsCnsVolumeAlreadyExistsFault(ctx, tt.faultTypeAfterReRegister) {
523+
// CnsVolumeAlreadyExistsFault means volume is already registered
524+
success = true
525+
} else {
526+
success = false
527+
}
528+
assert.Equal(t, tt.expectSuccess, success,
529+
"Test: %s - %s", tt.name, tt.description)
530+
})
531+
}
532+
}

pkg/common/cns-lib/volume/util.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -632,3 +632,17 @@ func IsCnsVolumeAlreadyExistsFault(ctx context.Context, faultType string) bool {
632632
log.Infof("Checking fault type: %q is vim.fault.CnsVolumeAlreadyExistsFault", faultType)
633633
return faultType == "vim.fault.CnsVolumeAlreadyExistsFault"
634634
}
635+
636+
// IsCnsNotRegisteredFault checks if the fault is CnsNotRegisteredFault
637+
func IsCnsNotRegisteredFault(ctx context.Context, fault *types.LocalizedMethodFault) bool {
638+
log := logger.GetLogger(ctx)
639+
if fault == nil || fault.Fault == nil {
640+
log.Infof("fault is nil or fault.Fault is nil. Not a CnsNotRegisteredFault")
641+
return false
642+
}
643+
if _, ok := fault.Fault.(*cnstypes.CnsNotRegisteredFault); ok {
644+
log.Infof("observed CnsNotRegisteredFault")
645+
return true
646+
}
647+
return false
648+
}

pkg/common/unittestcommon/types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,3 +224,7 @@ func (m *MockVolumeManager) SyncVolume(ctx context.Context,
224224
syncVolumeSpecs []cnstypes.CnsSyncVolumeSpec) (string, error) {
225225
return "", nil
226226
}
227+
228+
func (m *MockVolumeManager) ReRegisterVolume(ctx context.Context, volumeID string) error {
229+
return nil
230+
}

pkg/common/utils/utils_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ func getCommonUtilsTest(t *testing.T) *commonUtilsTest {
139139
t.Fatal(err)
140140
}
141141

142-
volumeManager, err := cnsvolumes.GetManager(ctx, virtualCenter, nil, false, false, false, "")
142+
volumeManager, err := cnsvolumes.GetManager(ctx, virtualCenter, nil, false, false, false, "", "", "")
143143
if err != nil {
144144
t.Fatalf("failed to create an instance of volume manager. err=%v", err)
145145
}

pkg/csi/service/common/vsphereutil.go

Lines changed: 10 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1226,31 +1226,20 @@ func DeleteSnapshotUtil(ctx context.Context, volumeManager cnsvolume.Manager, cs
12261226
}
12271227

12281228
// GetCnsVolumeType is the helper function that determines the volume type based on the volume-id
1229-
func GetCnsVolumeType(ctx context.Context, volumeManager cnsvolume.Manager, volumeId string) (string, error) {
1229+
// GetCnsVolumeType is the helper function that determines the volume type based on the volume-id prefix.
1230+
// If volume ID begins with "file:", it is a file volume, otherwise it is a block volume.
1231+
func GetCnsVolumeType(ctx context.Context, volumeId string) string {
12301232
log := logger.GetLogger(ctx)
12311233
var volumeType string
1232-
queryFilter := cnstypes.CnsQueryFilter{
1233-
VolumeIds: []cnstypes.CnsVolumeId{{Id: volumeId}},
1234-
}
1235-
querySelection := cnstypes.CnsQuerySelection{
1236-
Names: []string{
1237-
string(cnstypes.QuerySelectionNameTypeVolumeType),
1238-
},
1239-
}
1240-
// Select only the volume type.
1241-
queryResult, err := volumeManager.QueryAllVolume(ctx, queryFilter, querySelection)
1242-
if err != nil {
1243-
return "", logger.LogNewErrorCodef(log, codes.Internal,
1244-
"queryVolume failed for volumeID: %q with err=%+v", volumeId, err)
1234+
// Determine volume type based on volume ID prefix
1235+
if strings.HasPrefix(volumeId, "file:") {
1236+
volumeType = FileVolumeType
1237+
} else {
1238+
volumeType = BlockVolumeType
12451239
}
12461240

1247-
if len(queryResult.Volumes) == 0 {
1248-
log.Infof("volume: %s not found during query while determining CNS volume type", volumeId)
1249-
return "", ErrNotFound
1250-
}
1251-
volumeType = queryResult.Volumes[0].VolumeType
1252-
log.Infof("volume: %s is of type: %s", volumeId, volumeType)
1253-
return volumeType, nil
1241+
log.Infof("volume: %s is of type: %s (determined from volume ID prefix)", volumeId, volumeType)
1242+
return volumeType
12541243
}
12551244

12561245
// GetNodeVMsWithAccessToDatastore finds out NodeVMs which have access to the given

pkg/csi/service/common/vsphereutil_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,11 @@ func (m *mockVolumeManager) SyncVolume(ctx context.Context,
131131
syncVolumeSpecs []cnstypes.CnsSyncVolumeSpec) (string, error) {
132132
return "", nil
133133
}
134+
135+
func (m *mockVolumeManager) ReRegisterVolume(ctx context.Context, volumeID string) error {
136+
return nil
137+
}
138+
134139
func TestQueryVolumeSnapshotsByVolumeIDWithQuerySnapshotsCnsVolumeNotFoundFault(t *testing.T) {
135140
// Skip test on ARM64 due to gomonkey limitations
136141
if runtime.GOARCH == "arm64" {

0 commit comments

Comments
 (0)