Skip to content

Commit 2206fdb

Browse files
committed
Handle CnsNotRegisteredFault by re-registering volumes in CNS operations
This commit adds handling for CnsNotRegisteredFault in various CNS volume operations for the WORKLOAD cluster flavor. When a volume operation fails with CnsNotRegisteredFault, the driver now attempts to re-register the volume with CNS and retries the operation.

Changes include:
- Add clusterId and clusterDistribution parameters to GetManager() for volume re-registration
- Add reRegisterVolume() function to re-register unregistered volumes
- Add IsCnsNotRegisteredFault() helper to detect the fault type
- Add IsCnsVolumeAlreadyExistsFault() helper for idempotent re-registration
- Handle CnsNotRegisteredFault in:
  - AttachVolume
  - DetachVolume
  - DeleteVolume (with improved idempotency)
  - UpdateVolumeMetadata
  - UpdateVolumeCrypto
  - ExpandVolume (with improved idempotency)
  - CreateSnapshot (with improved idempotency and with transaction)
  - DeleteSnapshot

The re-registration is attempted only once per operation to prevent infinite loops. If re-registration fails or the retry fails, the original error is returned.
1 parent 1c0ad2a commit 2206fdb

17 files changed

Lines changed: 387 additions & 136 deletions

File tree

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ require (
2424
github.com/stretchr/testify v1.11.1
2525
github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250923172217-bf5a74e51c65
2626
github.com/vmware-tanzu/vm-operator/external/byok v0.0.0-20250509154507-b93e51fc90fa
27-
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203163802-5ce652387dac
27+
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203213634-99f18b71ea8e
2828
go.uber.org/zap v1.27.1
2929
golang.org/x/crypto v0.46.0
3030
golang.org/x/sync v0.19.0

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -315,8 +315,8 @@ github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250923172217-bf5a74e51c65 h1:
315315
github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250923172217-bf5a74e51c65/go.mod h1:nWTPpxfe4gHuuYuFcrs86+NMxfkqPk3a3IlvI8TCWak=
316316
github.com/vmware-tanzu/vm-operator/external/byok v0.0.0-20250509154507-b93e51fc90fa h1:4MKu14YJ7J54O6QKmT4ds5EUpysWLLtQRMff73cVkmU=
317317
github.com/vmware-tanzu/vm-operator/external/byok v0.0.0-20250509154507-b93e51fc90fa/go.mod h1:8tiuyYslzjLIUmOlXZuGKQdQP2ZgWGCVhVeyptmZYnk=
318-
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203163802-5ce652387dac h1:E3W+2J1I0B5LyIillKYVQHIb6CpslGcogt7Q+8FHT3c=
319-
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203163802-5ce652387dac/go.mod h1:FM3GTg002dFFN7l2/hNS0YWC4f78HTw4kvgUwAE52cM=
318+
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203213634-99f18b71ea8e h1:TG9xuPu9N29Ak1gNs85VsMImNv1bd2l0yNfAMc3imOU=
319+
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203213634-99f18b71ea8e/go.mod h1:FM3GTg002dFFN7l2/hNS0YWC4f78HTw4kvgUwAE52cM=
320320
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
321321
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
322322
github.com/xiang90/probing v0.0.0-20221125231312-a49e3df8f510 h1:S2dVYn90KE98chqDkyE9Z4N61UnQd+KOfgp5Iu53llk=

pkg/common/cns-lib/volume/manager.go

Lines changed: 300 additions & 18 deletions
Large diffs are not rendered by default.

pkg/common/cns-lib/volume/util.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -632,3 +632,17 @@ func IsCnsVolumeAlreadyExistsFault(ctx context.Context, faultType string) bool {
632632
log.Infof("Checking fault type: %q is vim.fault.CnsVolumeAlreadyExistsFault", faultType)
633633
return faultType == "vim.fault.CnsVolumeAlreadyExistsFault"
634634
}
635+
636+
// IsCnsNotRegisteredFault checks if the fault is CnsNotRegisteredFault
637+
func IsCnsNotRegisteredFault(ctx context.Context, fault *types.LocalizedMethodFault) bool {
638+
log := logger.GetLogger(ctx)
639+
if fault == nil || fault.Fault == nil {
640+
log.Infof("fault is nil or fault.Fault is nil. Not a CnsNotRegisteredFault")
641+
return false
642+
}
643+
if _, ok := fault.Fault.(*cnstypes.CnsNotRegisteredFault); ok {
644+
log.Infof("observed CnsNotRegisteredFault")
645+
return true
646+
}
647+
return false
648+
}

pkg/common/utils/utils_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ func getCommonUtilsTest(t *testing.T) *commonUtilsTest {
139139
t.Fatal(err)
140140
}
141141

142-
volumeManager, err := cnsvolumes.GetManager(ctx, virtualCenter, nil, false, false, false, "")
142+
volumeManager, err := cnsvolumes.GetManager(ctx, virtualCenter, nil, false, false, false, "", "", "")
143143
if err != nil {
144144
t.Fatalf("failed to create an instance of volume manager. err=%v", err)
145145
}

pkg/csi/service/common/vsphereutil.go

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1225,32 +1225,21 @@ func DeleteSnapshotUtil(ctx context.Context, volumeManager cnsvolume.Manager, cs
12251225
return cnsSnapshotInfo, nil
12261226
}
12271227

1228-
// GetCnsVolumeType is the helper function that determines the volume type based on the volume-id
1229-
func GetCnsVolumeType(ctx context.Context, volumeManager cnsvolume.Manager, volumeId string) (string, error) {
1228+
// GetCnsVolumeType is the helper function that determines the volume type based on the volume-id prefix.
1229+
// If volume ID begins with "file:", it is a file volume, otherwise it is a block volume.
1230+
func GetCnsVolumeType(ctx context.Context, volumeId string) string {
12301231
log := logger.GetLogger(ctx)
12311232
var volumeType string
1232-
queryFilter := cnstypes.CnsQueryFilter{
1233-
VolumeIds: []cnstypes.CnsVolumeId{{Id: volumeId}},
1234-
}
1235-
querySelection := cnstypes.CnsQuerySelection{
1236-
Names: []string{
1237-
string(cnstypes.QuerySelectionNameTypeVolumeType),
1238-
},
1239-
}
1240-
// Select only the volume type.
1241-
queryResult, err := volumeManager.QueryAllVolume(ctx, queryFilter, querySelection)
1242-
if err != nil {
1243-
return "", logger.LogNewErrorCodef(log, codes.Internal,
1244-
"queryVolume failed for volumeID: %q with err=%+v", volumeId, err)
1245-
}
12461233

1247-
if len(queryResult.Volumes) == 0 {
1248-
log.Infof("volume: %s not found during query while determining CNS volume type", volumeId)
1249-
return "", ErrNotFound
1234+
// Determine volume type based on volume ID prefix
1235+
if strings.HasPrefix(volumeId, "file:") {
1236+
volumeType = FileVolumeType
1237+
} else {
1238+
volumeType = BlockVolumeType
12501239
}
1251-
volumeType = queryResult.Volumes[0].VolumeType
1252-
log.Infof("volume: %s is of type: %s", volumeId, volumeType)
1253-
return volumeType, nil
1240+
1241+
log.Infof("volume: %s is of type: %s (determined from volume ID prefix)", volumeId, volumeType)
1242+
return volumeType
12541243
}
12551244

12561245
// GetNodeVMsWithAccessToDatastore finds out NodeVMs which have access to the given

pkg/csi/service/vanilla/controller.go

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,8 @@ func (c *controller) Init(config *cnsconfig.Config, version string) error {
232232
c.managers.VcenterConfigs[vcenterconfig.Host] = vcenterconfig
233233
volumeManager, err := cnsvolume.GetManager(ctx, vcenter,
234234
operationStore, true, true,
235-
multivCenterTopologyDeployment, cnstypes.CnsClusterFlavorVanilla)
235+
multivCenterTopologyDeployment, cnstypes.CnsClusterFlavorVanilla, config.Global.ClusterID,
236+
config.Global.ClusterDistribution)
236237
if err != nil {
237238
return logger.LogNewErrorf(log, "failed to create an instance of volume manager. err=%v", err)
238239
}
@@ -1617,16 +1618,7 @@ func (c *controller) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequ
16171618
}
16181619

16191620
if cnsVolumeType == common.UnknownVolumeType {
1620-
cnsVolumeType, err = common.GetCnsVolumeType(ctx, volumeManager, req.VolumeId)
1621-
if err != nil {
1622-
if err.Error() == common.ErrNotFound.Error() {
1623-
// The volume couldn't be found during query, assuming the delete operation as success
1624-
return &csi.DeleteVolumeResponse{}, "", nil
1625-
} else {
1626-
return nil, csifault.CSIInternalFault, logger.LogNewErrorCodef(log, codes.Internal,
1627-
"failed to determine CNS volume type for volume: %q. Error: %+v", req.VolumeId, err)
1628-
}
1629-
}
1621+
cnsVolumeType = common.GetCnsVolumeType(ctx, req.VolumeId)
16301622
volumeType = convertCnsVolumeType(ctx, cnsVolumeType)
16311623
}
16321624
// Check if the volume contains CNS snapshots only for block volumes.

pkg/csi/service/vanilla/controller_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ func getControllerTest(t *testing.T) *controllerTest {
243243

244244
volumeManager, err := cnsvolume.GetManager(ctx, vcenter,
245245
fakeOpStore, true, false,
246-
false, cnstypes.CnsClusterFlavorVanilla)
246+
false, cnstypes.CnsClusterFlavorVanilla, "", "")
247247
if err != nil {
248248
t.Fatalf("failed to create an instance of volume manager. err=%v", err)
249249
}

pkg/csi/service/vanilla/controller_topology_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,7 @@ func getControllerTestWithTopology(t *testing.T) *controllerTestTopology {
415415

416416
volumeManager, err := cnsvolume.GetManager(ctxtopology, vcenter,
417417
fakeOpStore, true, false,
418-
false, cnstypes.CnsClusterFlavorVanilla)
418+
false, cnstypes.CnsClusterFlavorVanilla, "", "")
419419
if err != nil {
420420
t.Fatalf("failed to create an instance of volume manager. err=%v", err)
421421
}

pkg/csi/service/wcp/controller.go

Lines changed: 23 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ func (c *controller) Init(config *cnsconfig.Config, version string) error {
207207

208208
volumeManager, err := cnsvolume.GetManager(ctx, vcenter, operationStore,
209209
idempotencyHandlingEnabled, false,
210-
false, cnstypes.CnsClusterFlavorWorkload)
210+
false, cnstypes.CnsClusterFlavorWorkload, config.Global.SupervisorID, config.Global.ClusterDistribution)
211211
if err != nil {
212212
return logger.LogNewErrorf(log, "failed to create an instance of volume manager. err=%v", err)
213213
}
@@ -450,10 +450,9 @@ func (c *controller) ReloadConfiguration(reconnectToVCFromNewConfig bool) error
450450
return logger.LogNewErrorf(log, "failed to reset volume manager. err=%v", err)
451451
}
452452
c.manager.VcenterConfig = newVCConfig
453-
454453
volumeManager, err := cnsvolume.GetManager(ctx, vcenter, operationStore,
455454
idempotencyHandlingEnabled, false,
456-
false, cnstypes.CnsClusterFlavorWorkload)
455+
false, cnstypes.CnsClusterFlavorWorkload, cfg.Global.SupervisorID, cfg.Global.ClusterDistribution)
457456
if err != nil {
458457
return logger.LogNewErrorf(log, "failed to create an instance of volume manager. err=%v", err)
459458
}
@@ -1792,16 +1791,7 @@ func (c *controller) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequ
17921791
return nil, csifault.CSIInvalidArgumentFault, err
17931792
}
17941793
if cnsVolumeType == common.UnknownVolumeType {
1795-
cnsVolumeType, err = common.GetCnsVolumeType(ctx, c.manager.VolumeManager, req.VolumeId)
1796-
if err != nil {
1797-
if err.Error() == common.ErrNotFound.Error() {
1798-
// The volume couldn't be found during query, assuming the delete operation as success
1799-
return &csi.DeleteVolumeResponse{}, "", nil
1800-
} else {
1801-
return nil, csifault.CSIInternalFault, logger.LogNewErrorCodef(log, codes.Internal,
1802-
"failed to determine CNS volume type for volume: %q. Error: %+v", req.VolumeId, err)
1803-
}
1804-
}
1794+
cnsVolumeType = common.GetCnsVolumeType(ctx, req.VolumeId)
18051795
volumeType = convertCnsVolumeType(ctx, cnsVolumeType)
18061796
}
18071797
// Check if the volume contains CNS snapshots only for block volumes.
@@ -2503,66 +2493,21 @@ func (c *controller) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshot
25032493
}
25042494
volumeID := req.GetSourceVolumeId()
25052495
volumeType = prometheus.PrometheusBlockVolumeType
2506-
// Query capacity in MB for block volume snapshot
2507-
volumeIds := []cnstypes.CnsVolumeId{{Id: volumeID}}
2508-
cnsVolumeDetailsMap, err := utils.QueryVolumeDetailsUtil(ctx, c.manager.VolumeManager, volumeIds)
2509-
if err != nil {
2510-
return nil, err
2511-
}
2512-
if _, ok := cnsVolumeDetailsMap[volumeID]; !ok {
2513-
return nil, logger.LogNewErrorCodef(log, codes.Internal,
2514-
"cns query volume did not return the volume: %s", volumeID)
2515-
}
2516-
snapshotSizeInMB := cnsVolumeDetailsMap[volumeID].SizeInMB
25172496

2518-
if cnsVolumeDetailsMap[volumeID].VolumeType != common.BlockVolumeType {
2497+
cnsvolumeType := common.GetCnsVolumeType(ctx, volumeID)
2498+
if cnsvolumeType != common.BlockVolumeType {
25192499
return nil, logger.LogNewErrorCodef(log, codes.FailedPrecondition,
2520-
"queried volume doesn't have the expected volume type. Expected VolumeType: %v. "+
2521-
"Queried VolumeType: %v", volumeType, cnsVolumeDetailsMap[volumeID].VolumeType)
2522-
}
2523-
2524-
// Extract namespace from request parameters
2525-
volumeSnapshotNamespace := req.Parameters[common.VolumeSnapshotNamespaceKey]
2526-
if volumeSnapshotNamespace == "" {
2527-
return nil, logger.LogNewErrorCodef(log, codes.Internal,
2528-
"volumesnapshot namespace is not set in the request parameters")
2529-
}
2530-
2531-
// Get snapshot limit from namespace ConfigMap
2532-
snapshotLimit, err := getSnapshotLimitForNamespace(ctx, volumeSnapshotNamespace)
2533-
if err != nil {
2534-
return nil, logger.LogNewErrorCodef(log, codes.Internal,
2535-
"failed to get snapshot limit for namespace %q: %v", volumeSnapshotNamespace, err)
2500+
"Expected VolumeType: %v. "+
2501+
"Observed VolumeType: %v", volumeType, cnsvolumeType)
25362502
}
2537-
log.Infof("Snapshot limit for namespace %q is set to %d", volumeSnapshotNamespace, snapshotLimit)
2538-
2539-
// Acquire lock for this volume to serialize snapshot operations
2540-
c.acquireSnapshotLock(ctx, volumeID)
2541-
defer c.releaseSnapshotLock(ctx, volumeID)
2542-
2543-
// Query existing snapshots for this volume
2544-
snapshotList, _, err := common.QueryVolumeSnapshotsByVolumeID(ctx, c.manager.VolumeManager, volumeID,
2545-
common.QuerySnapshotLimit)
2546-
if err != nil {
2547-
return nil, logger.LogNewErrorCodef(log, codes.Internal,
2548-
"failed to query snapshots for volume %q: %v", volumeID, err)
2549-
}
2550-
2551-
// Check if the limit is exceeded
2552-
currentSnapshotCount := len(snapshotList)
2553-
if currentSnapshotCount >= snapshotLimit {
2554-
return nil, logger.LogNewErrorCodef(log, codes.FailedPrecondition,
2555-
"the number of snapshots (%d) on the source volume %s has reached or exceeded "+
2556-
"the configured maximum (%d) for namespace %s",
2557-
currentSnapshotCount, volumeID, snapshotLimit, volumeSnapshotNamespace)
2558-
}
2559-
log.Infof("Current snapshot count for volume %q is %d, within limit of %d",
2560-
volumeID, currentSnapshotCount, snapshotLimit)
2503+
// TODO: We may need to add logic to check the limit of max number of snapshots by using
2504+
// GlobalMaxSnapshotsPerBlockVolume etc. variables in the future.
25612505

25622506
// the returned snapshotID below is a combination of CNS VolumeID and CNS SnapshotID concatenated by the "+"
25632507
// sign. That is, a string of "<UUID>+<UUID>". Because, all other CNS snapshot APIs still require both
25642508
// VolumeID and SnapshotID as the input, while corresponding snapshot APIs in upstream CSI require SnapshotID.
25652509
// So, we need to bridge the gap in vSphere CSI driver and return a combined SnapshotID to CSI Snapshotter.
2510+
var err error
25662511
var snapshotID string
25672512
var cnsSnapshotInfo *cnsvolume.CnsSnapshotInfo
25682513
var cnsVolumeInfo *cnsvolumeinfov1alpha1.CNSVolumeInfo
@@ -2618,6 +2563,17 @@ func (c *controller) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshot
26182563
"failed to create snapshot on volume %q with error: %v", volumeID, err)
26192564
}
26202565
}
2566+
// Query capacity in MB for block volume snapshot
2567+
volumeIds := []cnstypes.CnsVolumeId{{Id: volumeID}}
2568+
cnsVolumeDetailsMap, err := utils.QueryVolumeDetailsUtil(ctx, c.manager.VolumeManager, volumeIds)
2569+
if err != nil {
2570+
return nil, err
2571+
}
2572+
if _, ok := cnsVolumeDetailsMap[volumeID]; !ok {
2573+
return nil, logger.LogNewErrorCodef(log, codes.Internal,
2574+
"cns query volume did not return the volume: %s", volumeID)
2575+
}
2576+
snapshotSizeInMB := cnsVolumeDetailsMap[volumeID].SizeInMB
26212577
snapshotCreateTimeInProto := timestamppb.New(cnsSnapshotInfo.SnapshotLatestOperationCompleteTime)
26222578
createSnapshotResponse := &csi.CreateSnapshotResponse{
26232579
Snapshot: &csi.Snapshot{
@@ -2635,6 +2591,7 @@ func (c *controller) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshot
26352591
cnsSnapshotInfo.SnapshotLatestOperationCompleteTime, createSnapshotResponse)
26362592

26372593
volumeSnapshotName := req.Parameters[common.VolumeSnapshotNameKey]
2594+
volumeSnapshotNamespace := req.Parameters[common.VolumeSnapshotNamespaceKey]
26382595
log.Infof("Attempting to annotate volumesnapshot %s/%s with annotation %s:%s",
26392596
volumeSnapshotNamespace, volumeSnapshotName, common.VolumeSnapshotInfoKey, snapshotID)
26402597
annotated, err := commonco.ContainerOrchestratorUtility.AnnotateVolumeSnapshot(ctx, volumeSnapshotName,
@@ -2808,11 +2765,7 @@ func (c *controller) ControllerExpandVolume(ctx context.Context, req *csi.Contro
28082765
// Later we may need to define different csi faults.
28092766
// Check if the volume contains CNS snapshots only for block volumes.
28102767
if cnsVolumeType == common.UnknownVolumeType {
2811-
cnsVolumeType, err = common.GetCnsVolumeType(ctx, c.manager.VolumeManager, req.VolumeId)
2812-
if err != nil {
2813-
return nil, csifault.CSIInternalFault, logger.LogNewErrorCodef(log, codes.Internal,
2814-
"failed to determine CNS volume type for volume: %q. Error: %+v", req.VolumeId, err)
2815-
}
2768+
cnsVolumeType = common.GetCnsVolumeType(ctx, req.VolumeId)
28162769
volumeType = convertCnsVolumeType(ctx, cnsVolumeType)
28172770
}
28182771
if cnsVolumeType == common.BlockVolumeType &&

0 commit comments

Comments
 (0)