// GPUDeviceSpec requests a GPU device by model.
type GPUDeviceSpec struct {
	// NOTE(review): the 59-character limit on Name presumably leaves room for
	// the 4-character "gpu-" prefix that is prepended when claim and request
	// names are generated, keeping them within 63 characters — confirm.

	// A unique GPU device name inside the virtual machine spec.
	// The value is used to generate DRA claim and request names.
	// +kubebuilder:validation:MinLength:=1
	// +kubebuilder:validation:MaxLength:=59
	// +kubebuilder:validation:Pattern:=`^[a-z0-9]([-a-z0-9]*[a-z0-9])?$`
	Name string `json:"name"`
	// GPU product name, for example NVIDIA H100.
	// +kubebuilder:validation:MinLength:=1
	// +kubebuilder:validation:MaxLength:=128
	Model string `json:"model"`
}
diff --git a/api/core/v1alpha2/zz_generated.deepcopy.go b/api/core/v1alpha2/zz_generated.deepcopy.go index 00434dce81..32d7ec2239 100644 --- a/api/core/v1alpha2/zz_generated.deepcopy.go +++ b/api/core/v1alpha2/zz_generated.deepcopy.go @@ -473,6 +473,22 @@ func (in *Disruptions) DeepCopy() *Disruptions { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUDeviceSpec) DeepCopyInto(out *GPUDeviceSpec) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUDeviceSpec. +func (in *GPUDeviceSpec) DeepCopy() *GPUDeviceSpec { + if in == nil { + return nil + } + out := new(GPUDeviceSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ImagePullSecret) DeepCopyInto(out *ImagePullSecret) { *out = *in @@ -3359,6 +3375,11 @@ func (in *VirtualMachineSpec) DeepCopyInto(out *VirtualMachineSpec) { *out = make([]USBDeviceSpecRef, len(*in)) copy(*out, *in) } + if in.GPUDevices != nil { + in, out := &in.GPUDevices, &out.GPUDevices + *out = make([]GPUDeviceSpec, len(*in)) + copy(*out, *in) + } return } diff --git a/build/components/versions.yml b/build/components/versions.yml index d1e78ec49c..374eec0c06 100644 --- a/build/components/versions.yml +++ b/build/components/versions.yml @@ -3,7 +3,7 @@ firmware: libvirt: v10.9.0 edk2: stable202411 core: - 3p-kubevirt: v1.6.2-v12n.47 + 3p-kubevirt: feat/gpu/add-deckhouse-dra-support 3p-containerized-data-importer: v1.60.3-v12n.20 distribution: 2.8.3 package: diff --git a/crds/doc-ru-virtualmachines.yaml b/crds/doc-ru-virtualmachines.yaml index 8a4697f05f..9be5b1005e 100644 --- a/crds/doc-ru-virtualmachines.yaml +++ b/crds/doc-ru-virtualmachines.yaml @@ -588,6 +588,20 @@ spec: name: description: | Имя ресурса `USBDevice` в том же пространстве имен. 
+ gpuDevices: + description: | + Список GPU-устройств для подключения к виртуальной машине. + Устройства запрашиваются по модели GPU. + Для использования требуется feature gate `GPU` и DeviceClass `gpu.deckhouse.io`. + items: + properties: + model: + description: | + Название продукта GPU, например `NVIDIA H100`. + name: + description: | + Уникальное имя GPU-устройства внутри спецификации виртуальной машины. + Значение используется для генерации имён DRA claim и request. status: properties: blockDeviceRefs: diff --git a/crds/virtualmachines.yaml b/crds/virtualmachines.yaml index 1a1ff088d5..a523522c90 100644 --- a/crds/virtualmachines.yaml +++ b/crds/virtualmachines.yaml @@ -1038,6 +1038,36 @@ spec: type: string description: | The name of USBDevice resource in the same namespace. + gpuDevices: + type: array + maxItems: 16 + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + description: | + List of GPU devices to attach to the virtual machine. + Devices are requested by GPU model. + This feature requires the GPU feature gate and the gpu.deckhouse.io DeviceClass. + items: + type: object + required: + - model + - name + properties: + model: + minLength: 1 + maxLength: 128 + type: string + description: | + GPU product name, for example NVIDIA H100. + name: + minLength: 1 + maxLength: 59 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + description: | + A unique GPU device name inside the virtual machine spec. + The value is used to generate DRA claim and request names. status: type: object properties: diff --git a/docs/USER_GUIDE.md b/docs/USER_GUIDE.md index ae584d9d62..0ccecac7fd 100644 --- a/docs/USER_GUIDE.md +++ b/docs/USER_GUIDE.md @@ -3775,6 +3775,52 @@ spec: As a result, a VM named `clone-database-prod` and a disk named `clone-database-root-prod` will be created. +## GPU Devices + +{{< alert level="warning" >}} +GPU device passthrough is an experimental feature. 
It requires the Enterprise Edition (EE), Kubernetes DRA support, and an external GPU DRA provider that creates the `gpu.deckhouse.io` `DeviceClass`. +{{< /alert >}} + +The virtualization module can attach physical GPU devices to virtual machines using DRA (Dynamic Resource Allocation). A GPU is requested by product model through the `.spec.gpuDevices` field of the [VirtualMachine](/modules/virtualization/cr.html#virtualmachine) resource. + +GPU device passthrough requires: + +- Kubernetes version 1.34 or higher with DRA feature gates required by the cluster configuration. +- The `GPU` feature gate enabled in the `virtualization` module settings. +- A GPU DRA provider installed in the cluster. +- The `gpu.deckhouse.io` [DeviceClass](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#device-classes) created by the GPU DRA provider. + +To enable the module feature gate: + +```yaml +apiVersion: deckhouse.io/v1alpha1 +kind: ModuleConfig +metadata: + name: virtualization +spec: + settings: + featureGates: + - GPU +``` + +To request a GPU device, add `.spec.gpuDevices` to the VM specification: + +```yaml +apiVersion: virtualization.deckhouse.io/v1alpha2 +kind: VirtualMachine +metadata: + name: linux-vm +spec: + # ... other VM settings ... + gpuDevices: + - name: gpu0 + model: NVIDIA H100 +``` + +The `name` field must be unique within `.spec.gpuDevices` and can contain up to 59 DNS-label characters. The `model` field must match the GPU product name exposed by the GPU DRA provider in the `device.attributes["gpu.deckhouse.io"].productName` device attribute. + +Changing `.spec.gpuDevices` requires restarting the virtual machine to apply the new configuration. 
+ ## USB Devices {{< alert level="warning" >}} diff --git a/docs/USER_GUIDE.ru.md b/docs/USER_GUIDE.ru.md index 42df93d945..cd1b723ab1 100644 --- a/docs/USER_GUIDE.ru.md +++ b/docs/USER_GUIDE.ru.md @@ -3806,6 +3806,52 @@ spec: В результате будет создана ВМ с именем `clone-database-prod` и диск с именем `clone-database-root-prod`. +## GPU-устройства + +{{< alert level="warning" >}} +Проброс GPU-устройств — экспериментальная возможность. Для работы требуются Enterprise Edition (EE), поддержка Kubernetes DRA и внешний GPU DRA-провайдер, создающий `DeviceClass` с именем `gpu.deckhouse.io`. +{{< /alert >}} + +Модуль виртуализации может подключать физические GPU-устройства к виртуальным машинам с помощью DRA (Dynamic Resource Allocation). GPU запрашивается по модели продукта через поле `.spec.gpuDevices` ресурса [VirtualMachine](/modules/virtualization/cr.html#virtualmachine). + +Для проброса GPU требуются: + +- Kubernetes версии 1.34 или выше с DRA feature gates, необходимыми для конфигурации кластера. +- Feature gate `GPU`, включённый в настройках модуля `virtualization`. +- Установленный в кластере GPU DRA-провайдер. +- [DeviceClass](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#device-classes) `gpu.deckhouse.io`, созданный GPU DRA-провайдером. + +Чтобы включить feature gate модуля: + +```yaml +apiVersion: deckhouse.io/v1alpha1 +kind: ModuleConfig +metadata: + name: virtualization +spec: + settings: + featureGates: + - GPU +``` + +Чтобы запросить GPU-устройство, добавьте `.spec.gpuDevices` в спецификацию ВМ: + +```yaml +apiVersion: virtualization.deckhouse.io/v1alpha2 +kind: VirtualMachine +metadata: + name: linux-vm +spec: + # ... другие настройки ВМ ... + gpuDevices: + - name: gpu0 + model: NVIDIA H100 +``` + +Поле `name` должно быть уникальным внутри `.spec.gpuDevices` и может содержать до 59 символов DNS label. 
Поле `model` должно совпадать с названием продукта GPU, которое GPU DRA-провайдер публикует в атрибуте устройства `device.attributes["gpu.deckhouse.io"].productName`. + +Изменение `.spec.gpuDevices` требует перезапуска виртуальной машины для применения новой конфигурации. + ## USB-устройства {{< alert level="warning">}} diff --git a/images/virt-artifact/werf.inc.yaml b/images/virt-artifact/werf.inc.yaml index 0d75ebb629..a6a797be21 100644 --- a/images/virt-artifact/werf.inc.yaml +++ b/images/virt-artifact/werf.inc.yaml @@ -13,8 +13,10 @@ secrets: - id: SOURCE_REPO value: {{ $.SOURCE_REPO }} shell: + installCacheVersion: "{{ now | date "Mon Jan 2 15:04:05 MST 2006" }}" install: - | + echo "$date" echo "Git clone {{ $gitRepoName }} repository..." git clone --depth=1 $(cat /run/secrets/SOURCE_REPO)/{{ $gitRepoUrl }} --branch {{ $tag }} /src/kubevirt diff --git a/images/virtualization-artifact/pkg/builder/vm/option.go b/images/virtualization-artifact/pkg/builder/vm/option.go index af0be0a1c8..749b370640 100644 --- a/images/virtualization-artifact/pkg/builder/vm/option.go +++ b/images/virtualization-artifact/pkg/builder/vm/option.go @@ -162,6 +162,12 @@ func WithUSBDevices(usbDevices []v1alpha2.USBDeviceSpecRef) Option { } } +func WithGPUDevices(gpuDevices []v1alpha2.GPUDeviceSpec) Option { + return func(vm *v1alpha2.VirtualMachine) { + vm.Spec.GPUDevices = gpuDevices + } +} + func WithIpAddress(ipAddress string) Option { return func(vm *v1alpha2.VirtualMachine) { vm.Spec.VirtualMachineIPAddress = ipAddress diff --git a/images/virtualization-artifact/pkg/controller/kvbuilder/gpu.go b/images/virtualization-artifact/pkg/controller/kvbuilder/gpu.go new file mode 100644 index 0000000000..de191eb84e --- /dev/null +++ b/images/virtualization-artifact/pkg/controller/kvbuilder/gpu.go @@ -0,0 +1,94 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kvbuilder + +import ( + "slices" + "strings" + + corev1 "k8s.io/api/core/v1" + "k8s.io/utils/ptr" + virtv1 "kubevirt.io/api/core/v1" + + "github.com/deckhouse/virtualization/api/core/v1alpha2" +) + +const ( + GPUNamePrefix = "gpu-" + GPUDeviceClassName = "gpu.deckhouse.io" +) + +func GPUResourceClaimName(deviceName string) string { + return GPUNamePrefix + deviceName +} + +func GPUResourceClaimTemplateName(vmName, deviceName string) string { + return vmName + "-" + deviceName +} + +func IsGPUResourceClaimTemplateName(vmName, templateName string) bool { + return strings.HasPrefix(templateName, vmName+"-") +} + +func (b *KVVM) SetGPUDevices(vmName string, devices []v1alpha2.GPUDeviceSpec) { + devices = SortGPUDevices(devices) + + b.Resource.Spec.Template.Spec.ResourceClaims = slices.DeleteFunc( + b.Resource.Spec.Template.Spec.ResourceClaims, + func(claim virtv1.ResourceClaim) bool { + return strings.HasPrefix(claim.Name, GPUNamePrefix) + }, + ) + b.Resource.Spec.Template.Spec.Domain.Devices.GPUs = slices.DeleteFunc( + b.Resource.Spec.Template.Spec.Domain.Devices.GPUs, + func(gpu virtv1.GPU) bool { + return strings.HasPrefix(gpu.Name, GPUNamePrefix) + }, + ) + + if len(devices) == 0 { + return + } + + for _, device := range devices { + claimName := GPUResourceClaimName(device.Name) + b.Resource.Spec.Template.Spec.ResourceClaims = append(b.Resource.Spec.Template.Spec.ResourceClaims, virtv1.ResourceClaim{ + PodResourceClaim: corev1.PodResourceClaim{ + Name: claimName, + ResourceClaimTemplateName: ptr.To(GPUResourceClaimTemplateName(vmName, device.Name)), + }, + 
}) + b.Resource.Spec.Template.Spec.Domain.Devices.GPUs = append(b.Resource.Spec.Template.Spec.Domain.Devices.GPUs, virtv1.GPU{ + Name: claimName, + ClaimRequest: &virtv1.ClaimRequest{ + ClaimName: ptr.To(claimName), + RequestName: ptr.To(GPUResourceClaimName(device.Name)), + }, + }) + } +} + +func SortGPUDevices(devices []v1alpha2.GPUDeviceSpec) []v1alpha2.GPUDeviceSpec { + if len(devices) == 0 { + return nil + } + sorted := slices.Clone(devices) + slices.SortFunc(sorted, func(a, b v1alpha2.GPUDeviceSpec) int { + return strings.Compare(a.Name, b.Name) + }) + return sorted +} diff --git a/images/virtualization-artifact/pkg/controller/kvbuilder/gpu_test.go b/images/virtualization-artifact/pkg/controller/kvbuilder/gpu_test.go new file mode 100644 index 0000000000..1babc5c931 --- /dev/null +++ b/images/virtualization-artifact/pkg/controller/kvbuilder/gpu_test.go @@ -0,0 +1,86 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kvbuilder + +import ( + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/types" + + "github.com/deckhouse/virtualization/api/core/v1alpha2" +) + +var _ = Describe("GPU", func() { + It("should render DRA GPU resource claims", func() { + kvvm := NewEmptyKVVM(types.NamespacedName{Name: "vm-a", Namespace: "default"}, KVVMOptions{}) + + kvvm.SetGPUDevices("vm-a", []v1alpha2.GPUDeviceSpec{{Name: "gpu0", Model: "NVIDIA H100"}}) + res := kvvm.GetResource() + + Expect(res.Spec.Template.Spec.ResourceClaims).To(HaveLen(1)) + Expect(res.Spec.Template.Spec.ResourceClaims[0].Name).To(Equal("gpu-gpu0")) + Expect(*res.Spec.Template.Spec.ResourceClaims[0].ResourceClaimTemplateName).To(Equal("vm-a-gpu0")) + Expect(res.Spec.Template.Spec.Domain.Devices.GPUs).To(HaveLen(1)) + Expect(res.Spec.Template.Spec.Domain.Devices.GPUs[0].Name).To(Equal("gpu-gpu0")) + Expect(*res.Spec.Template.Spec.Domain.Devices.GPUs[0].ClaimName).To(Equal("gpu-gpu0")) + Expect(*res.Spec.Template.Spec.Domain.Devices.GPUs[0].RequestName).To(Equal("gpu-gpu0")) + Expect(res.Annotations).To(BeEmpty()) + }) + + It("should render DRA GPU resource claims in stable order", func() { + kvvm := NewEmptyKVVM(types.NamespacedName{Name: "vm-a", Namespace: "default"}, KVVMOptions{}) + + kvvm.SetGPUDevices("vm-a", []v1alpha2.GPUDeviceSpec{ + {Name: "gpu1", Model: "NVIDIA H100"}, + {Name: "gpu0", Model: "NVIDIA A100-SXM4-40GB"}, + }) + res := kvvm.GetResource() + + Expect(res.Spec.Template.Spec.ResourceClaims).To(HaveLen(2)) + Expect(res.Spec.Template.Spec.ResourceClaims[0].Name).To(Equal("gpu-gpu0")) + Expect(res.Spec.Template.Spec.ResourceClaims[1].Name).To(Equal("gpu-gpu1")) + Expect(res.Spec.Template.Spec.Domain.Devices.GPUs).To(HaveLen(2)) + Expect(res.Spec.Template.Spec.Domain.Devices.GPUs[0].Name).To(Equal("gpu-gpu0")) + Expect(res.Spec.Template.Spec.Domain.Devices.GPUs[1].Name).To(Equal("gpu-gpu1")) + }) + + It("should replace rendered DRA GPU resource claims", func() { + kvvm := NewEmptyKVVM(types.NamespacedName{Name: "vm-a", Namespace: 
"default"}, KVVMOptions{}) + kvvm.SetGPUDevices("vm-a", []v1alpha2.GPUDeviceSpec{{Name: "gpu0", Model: "NVIDIA H100"}}) + + kvvm.SetGPUDevices("vm-a", []v1alpha2.GPUDeviceSpec{{Name: "gpu1", Model: "NVIDIA A100-SXM4-40GB"}}) + res := kvvm.GetResource() + + Expect(res.Spec.Template.Spec.ResourceClaims).To(HaveLen(1)) + Expect(res.Spec.Template.Spec.ResourceClaims[0].Name).To(Equal("gpu-gpu1")) + Expect(*res.Spec.Template.Spec.ResourceClaims[0].ResourceClaimTemplateName).To(Equal("vm-a-gpu1")) + Expect(res.Spec.Template.Spec.Domain.Devices.GPUs).To(HaveLen(1)) + Expect(res.Spec.Template.Spec.Domain.Devices.GPUs[0].Name).To(Equal("gpu-gpu1")) + }) + + It("should remove rendered DRA GPU resource claims", func() { + kvvm := NewEmptyKVVM(types.NamespacedName{Name: "vm-a", Namespace: "default"}, KVVMOptions{}) + kvvm.SetGPUDevices("vm-a", []v1alpha2.GPUDeviceSpec{{Name: "gpu0", Model: "NVIDIA H100"}}) + + kvvm.SetGPUDevices("vm-a", nil) + res := kvvm.GetResource() + + Expect(res.Spec.Template.Spec.ResourceClaims).To(BeEmpty()) + Expect(res.Spec.Template.Spec.Domain.Devices.GPUs).To(BeEmpty()) + Expect(res.Annotations).To(BeEmpty()) + }) +}) diff --git a/images/virtualization-artifact/pkg/controller/kvbuilder/kvvm_utils.go b/images/virtualization-artifact/pkg/controller/kvbuilder/kvvm_utils.go index 72503e3004..10fcf8188a 100644 --- a/images/virtualization-artifact/pkg/controller/kvbuilder/kvvm_utils.go +++ b/images/virtualization-artifact/pkg/controller/kvbuilder/kvvm_utils.go @@ -133,6 +133,8 @@ func ApplyVirtualMachineSpec( return err } + kvvm.SetGPUDevices(vm.Name, vm.Spec.GPUDevices) + if err := kvvm.SetProvisioning(vm.Spec.Provisioning); err != nil { return err } diff --git a/images/virtualization-artifact/pkg/controller/kvbuilder/suite_test.go b/images/virtualization-artifact/pkg/controller/kvbuilder/suite_test.go new file mode 100644 index 0000000000..b51540d37d --- /dev/null +++ b/images/virtualization-artifact/pkg/controller/kvbuilder/suite_test.go @@ -0,0 +1,29 
// TestKVBuilder is the go test entry point for the package's Ginkgo specs: it
// registers Ginkgo's Fail as the Gomega failure handler and runs the suite
// under the name "KVBuilder Suite".
func TestKVBuilder(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "KVBuilder Suite")
}
+*/ + +package internal + +import ( + "context" + "fmt" + "reflect" + "strconv" + + resourcev1 "k8s.io/api/resource/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/deckhouse/virtualization-controller/pkg/controller/kvbuilder" + "github.com/deckhouse/virtualization-controller/pkg/controller/service" + "github.com/deckhouse/virtualization-controller/pkg/controller/vm/internal/state" + "github.com/deckhouse/virtualization-controller/pkg/logger" + "github.com/deckhouse/virtualization/api/core/v1alpha2" +) + +const nameGPUResourceClaimHandler = "GPUResourceClaimHandler" + +func NewGPUResourceClaimHandler(client client.Client) *GPUResourceClaimHandler { + return &GPUResourceClaimHandler{client: client} +} + +type GPUResourceClaimHandler struct { + client client.Client +} + +func (h *GPUResourceClaimHandler) Name() string { + return nameGPUResourceClaimHandler +} + +func (h *GPUResourceClaimHandler) Handle(ctx context.Context, s state.VirtualMachineState) (reconcile.Result, error) { + if s.VirtualMachine().IsEmpty() { + return reconcile.Result{}, nil + } + + vm := s.VirtualMachine().Current() + log := logger.FromContext(ctx).With(logger.SlogHandler(nameGPUResourceClaimHandler)) + desiredTemplateNames := make(map[string]struct{}, len(vm.Spec.GPUDevices)) + + for _, device := range vm.Spec.GPUDevices { + templateName := kvbuilder.GPUResourceClaimTemplateName(vm.Name, device.Name) + desiredTemplateNames[templateName] = struct{}{} + desiredSpec := buildGPUResourceClaimTemplateSpec(device) + template := &resourcev1.ResourceClaimTemplate{} + key := types.NamespacedName{Name: templateName, Namespace: vm.Namespace} + + err := h.client.Get(ctx, key, template) + if err != nil && !apierrors.IsNotFound(err) { + return reconcile.Result{}, fmt.Errorf("failed to get GPU ResourceClaimTemplate: 
%w", err) + } + + if apierrors.IsNotFound(err) { + template = buildGPUResourceClaimTemplate(vm, templateName, desiredSpec) + if err := h.client.Create(ctx, template); err != nil && !apierrors.IsAlreadyExists(err) { + return reconcile.Result{}, fmt.Errorf("failed to create GPU ResourceClaimTemplate: %w", err) + } + log.Info("created GPU ResourceClaimTemplate", "template", templateName) + continue + } + + if !metav1.IsControlledBy(template, vm) { + return reconcile.Result{}, fmt.Errorf("GPU ResourceClaimTemplate %s/%s is not controlled by VirtualMachine %s/%s", template.Namespace, template.Name, vm.Namespace, vm.Name) + } + + if reflect.DeepEqual(template.Spec, desiredSpec) { + continue + } + if err := h.client.Delete(ctx, template); err != nil && !apierrors.IsNotFound(err) { + return reconcile.Result{}, fmt.Errorf("failed to delete outdated GPU ResourceClaimTemplate: %w", err) + } + template = buildGPUResourceClaimTemplate(vm, templateName, desiredSpec) + if err := h.client.Create(ctx, template); err != nil && !apierrors.IsAlreadyExists(err) { + return reconcile.Result{}, fmt.Errorf("failed to recreate GPU ResourceClaimTemplate: %w", err) + } + log.Info("recreated GPU ResourceClaimTemplate", "template", templateName) + } + + if err := h.deleteOrphanedTemplates(ctx, vm, desiredTemplateNames); err != nil { + return reconcile.Result{}, err + } + return reconcile.Result{}, nil +} + +func buildGPUResourceClaimTemplate(vm *v1alpha2.VirtualMachine, name string, spec resourcev1.ResourceClaimTemplateSpec) *resourcev1.ResourceClaimTemplate { + return &resourcev1.ResourceClaimTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: vm.Namespace, + OwnerReferences: []metav1.OwnerReference{service.MakeControllerOwnerReference(vm)}, + }, + Spec: spec, + } +} + +func buildGPUResourceClaimTemplateSpec(device v1alpha2.GPUDeviceSpec) resourcev1.ResourceClaimTemplateSpec { + selector := fmt.Sprintf( + `device.attributes["gpu.deckhouse.io"].productName == %s && 
device.attributes["gpu.deckhouse.io"].deviceType == "physical" && !has(device.attributes["gpu.deckhouse.io"].sharingStrategy)`, + strconv.Quote(device.Model), + ) + return resourcev1.ResourceClaimTemplateSpec{ + Spec: resourcev1.ResourceClaimSpec{ + Devices: resourcev1.DeviceClaim{ + Requests: []resourcev1.DeviceRequest{{ + Name: kvbuilder.GPUResourceClaimName(device.Name), + Exactly: &resourcev1.ExactDeviceRequest{ + DeviceClassName: kvbuilder.GPUDeviceClassName, + AllocationMode: resourcev1.DeviceAllocationModeExactCount, + Count: 1, + Selectors: []resourcev1.DeviceSelector{{ + CEL: &resourcev1.CELDeviceSelector{Expression: selector}, + }}, + }, + }}, + }, + }, + } +} + +func (h *GPUResourceClaimHandler) deleteOrphanedTemplates(ctx context.Context, vm *v1alpha2.VirtualMachine, desiredTemplateNames map[string]struct{}) error { + var templates resourcev1.ResourceClaimTemplateList + if err := h.client.List(ctx, &templates, client.InNamespace(vm.Namespace)); err != nil { + return fmt.Errorf("failed to list GPU ResourceClaimTemplates: %w", err) + } + + for i := range templates.Items { + template := &templates.Items[i] + if !metav1.IsControlledBy(template, vm) || !kvbuilder.IsGPUResourceClaimTemplateName(vm.Name, template.Name) { + continue + } + if _, ok := desiredTemplateNames[template.Name]; ok { + continue + } + if err := h.client.Delete(ctx, template); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("failed to delete GPU ResourceClaimTemplate: %w", err) + } + } + return nil +} diff --git a/images/virtualization-artifact/pkg/controller/vm/internal/gpu_resourceclaim_handler_test.go b/images/virtualization-artifact/pkg/controller/vm/internal/gpu_resourceclaim_handler_test.go new file mode 100644 index 0000000000..cfd48dbf02 --- /dev/null +++ b/images/virtualization-artifact/pkg/controller/vm/internal/gpu_resourceclaim_handler_test.go @@ -0,0 +1,93 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you 
may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package internal + +import ( + context "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + resourcev1 "k8s.io/api/resource/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + + "github.com/deckhouse/virtualization-controller/pkg/controller/kvbuilder" + "github.com/deckhouse/virtualization/api/core/v1alpha2" +) + +var _ = Describe("GPUResourceClaimHandler", func() { + const ( + vmName = "vm-a" + namespace = "default" + gpuModel = "NVIDIA H100" + ) + + newVM := func(devices ...v1alpha2.GPUDeviceSpec) *v1alpha2.VirtualMachine { + return &v1alpha2.VirtualMachine{ + ObjectMeta: metav1.ObjectMeta{Name: vmName, Namespace: namespace}, + Spec: v1alpha2.VirtualMachineSpec{GPUDevices: devices}, + } + } + + It("should create GPU ResourceClaimTemplate", func() { + fakeClient, _, vmState := setupEnvironment(newVM(v1alpha2.GPUDeviceSpec{Name: "gpu0", Model: gpuModel})) + handler := NewGPUResourceClaimHandler(fakeClient) + + _, err := handler.Handle(context.Background(), vmState) + + Expect(err).NotTo(HaveOccurred()) + template := &resourcev1.ResourceClaimTemplate{} + Expect(fakeClient.Get(context.Background(), types.NamespacedName{Name: kvbuilder.GPUResourceClaimTemplateName(vmName, "gpu0"), Namespace: namespace}, template)).To(Succeed()) + Expect(template.Spec.Spec.Devices.Requests).To(HaveLen(1)) + request := template.Spec.Spec.Devices.Requests[0] + Expect(request.Name).To(Equal(kvbuilder.GPUResourceClaimName("gpu0"))) + 
Expect(request.Exactly.DeviceClassName).To(Equal(kvbuilder.GPUDeviceClassName)) + Expect(request.Exactly.Selectors[0].CEL.Expression).To(ContainSubstring(`productName == "NVIDIA H100"`)) + Expect(request.Exactly.Selectors[0].CEL.Expression).To(ContainSubstring(`deviceType == "physical"`)) + Expect(request.Exactly.Selectors[0].CEL.Expression).To(ContainSubstring(`!has(device.attributes["gpu.deckhouse.io"].sharingStrategy)`)) + }) + + It("should delete owned GPU ResourceClaimTemplate when annotation is removed", func() { + vm := newVM() + template := buildGPUResourceClaimTemplate(vm, kvbuilder.GPUResourceClaimTemplateName(vmName, "gpu0"), buildGPUResourceClaimTemplateSpec(v1alpha2.GPUDeviceSpec{Name: "gpu0", Model: gpuModel})) + fakeClient, _, vmState := setupEnvironment(vm, template) + handler := NewGPUResourceClaimHandler(fakeClient) + + _, err := handler.Handle(context.Background(), vmState) + + Expect(err).NotTo(HaveOccurred()) + stored := &resourcev1.ResourceClaimTemplate{} + err = fakeClient.Get(context.Background(), types.NamespacedName{Name: kvbuilder.GPUResourceClaimTemplateName(vmName, "gpu0"), Namespace: namespace}, stored) + Expect(err).To(HaveOccurred()) + }) + + It("should not replace GPU ResourceClaimTemplate owned by another controller", func() { + vm := newVM(v1alpha2.GPUDeviceSpec{Name: "gpu0", Model: gpuModel}) + template := &resourcev1.ResourceClaimTemplate{ + ObjectMeta: metav1.ObjectMeta{Name: kvbuilder.GPUResourceClaimTemplateName(vmName, "gpu0"), Namespace: namespace}, + } + fakeClient, _, vmState := setupEnvironment(vm, template) + handler := NewGPUResourceClaimHandler(fakeClient) + + _, err := handler.Handle(context.Background(), vmState) + + Expect(err).To(HaveOccurred()) + stored := &resourcev1.ResourceClaimTemplate{} + Expect(fakeClient.Get(context.Background(), types.NamespacedName{Name: template.Name, Namespace: namespace}, stored)).To(Succeed()) + Expect(stored.OwnerReferences).To(BeEmpty()) + }) +}) diff --git 
a/images/virtualization-artifact/pkg/controller/vm/internal/sync_kvvm_test.go b/images/virtualization-artifact/pkg/controller/vm/internal/sync_kvvm_test.go
index 7c2b292069..a3b9d610aa 100644
--- a/images/virtualization-artifact/pkg/controller/vm/internal/sync_kvvm_test.go
+++ b/images/virtualization-artifact/pkg/controller/vm/internal/sync_kvvm_test.go
@@ -308,6 +308,50 @@ var _ = Describe("SyncKvvmHandler", func() {
 		Entry("Pending phase without changes, shouldn't have condition", v1alpha2.MachinePending, false, metav1.ConditionUnknown, false),
 	)
 
+	// A running VM whose current spec requests a different GPU model than the
+	// last applied spec stored on the KVVM must end up awaiting a restart, and
+	// the KVVM domain must still list no GPUs after the reconcile.
+	It("should require restart when GPU devices change on a running VM", func() {
+		ip := makeVMIP()
+		vmClass := makeVMClass()
+
+		// Current (desired) spec: one GPU, model "NVIDIA H100".
+		vm := makeVM(v1alpha2.MachineRunning)
+		vm.Spec.GPUDevices = []v1alpha2.GPUDeviceSpec{{Name: "gpu0", Model: "NVIDIA H100"}}
+		kvvm := makeKVVM(vm)
+		// Last applied spec: copies the fields listed below from the current
+		// spec, but records the same GPU name with a different model.
+		Expect(kvbuilder.SetLastAppliedSpec(kvvm, &v1alpha2.VirtualMachine{
+			Spec: v1alpha2.VirtualMachineSpec{
+				CPU: v1alpha2.CPUSpec{
+					Cores: vm.Spec.CPU.Cores,
+				},
+				Memory: v1alpha2.MemorySpec{
+					Size: vm.Spec.Memory.Size,
+				},
+				VirtualMachineIPAddress: vm.Spec.VirtualMachineIPAddress,
+				RunPolicy:               vm.Spec.RunPolicy,
+				OsType:                  vm.Spec.OsType,
+				VirtualMachineClassName: vm.Spec.VirtualMachineClassName,
+				Disruptions: &v1alpha2.Disruptions{
+					RestartApprovalMode: vm.Spec.Disruptions.RestartApprovalMode,
+				},
+				GPUDevices: []v1alpha2.GPUDeviceSpec{{Name: "gpu0", Model: "NVIDIA A100-SXM4-40GB"}},
+			},
+		})).To(Succeed())
+		kvvm.SetGroupVersionKind(virtv1.VirtualMachineGroupVersionKind)
+		kvvmi := makeKVVMI()
+
+		fakeClient, reconcileObj, vmState = setupEnvironment(vm, kvvm, kvvmi, ip, vmClass)
+
+		reconcile()
+
+		// The VM status must report the pending restart and name at least one
+		// awaiting change.
+		newVM := &v1alpha2.VirtualMachine{}
+		Expect(fakeClient.Get(ctx, client.ObjectKeyFromObject(vm), newVM)).To(Succeed())
+		awaitCond, awaitExists := conditions.GetCondition(vmcondition.TypeAwaitingRestartToApplyConfiguration, newVM.Status.Conditions)
+		Expect(awaitExists).To(BeTrue())
+		Expect(awaitCond.Status).To(Equal(metav1.ConditionTrue))
+		Expect(newVM.Status.RestartAwaitingChanges).NotTo(BeEmpty())
+
+		// The stored KVVM is re-read to check that its GPU list is still empty.
+		updatedKVVM := &virtv1.VirtualMachine{}
+		Expect(fakeClient.Get(ctx, client.ObjectKeyFromObject(kvvm), updatedKVVM)).To(Succeed())
+		Expect(updatedKVVM.Spec.Template.Spec.Domain.Devices.GPUs).To(BeEmpty())
+	})
+
 	DescribeTable("AwaitingRestart Condition for NonMigratable VM",
 		func(phase v1alpha2.MachinePhase, featureGate featuregate.FeatureGate, mutateFn func(fakeClient client.WithWatch, vm *v1alpha2.VirtualMachine, kvvm *virtv1.VirtualMachine), expectedStatus metav1.ConditionStatus, expectedExistence bool) {
 			ip := makeVMIP()
diff --git a/images/virtualization-artifact/pkg/controller/vm/internal/validators/gpu_devices_validator.go b/images/virtualization-artifact/pkg/controller/vm/internal/validators/gpu_devices_validator.go
new file mode 100644
index 0000000000..e5a92fe51c
--- /dev/null
+++ b/images/virtualization-artifact/pkg/controller/vm/internal/validators/gpu_devices_validator.go
@@ -0,0 +1,69 @@
+/*
+Copyright 2026 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package validators
+
+import (
+	"context"
+	"fmt"
+
+	resourcev1 "k8s.io/api/resource/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/component-base/featuregate"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
+
+	"github.com/deckhouse/virtualization-controller/pkg/controller/kvbuilder"
+	"github.com/deckhouse/virtualization-controller/pkg/featuregates"
+	"github.com/deckhouse/virtualization/api/core/v1alpha2"
+)
+
+// GPUDevicesValidator checks the prerequisites for a VirtualMachine that
+// lists GPU devices in its spec: the GPU feature gate must be enabled and the
+// DeviceClass named kvbuilder.GPUDeviceClassName must exist.
+type GPUDevicesValidator struct {
+	client      client.Client
+	featureGate featuregate.FeatureGate
+}
+
+// NewGPUDevicesValidator returns a validator that reads the DeviceClass
+// through the given client and consults the given feature gate.
+func NewGPUDevicesValidator(client client.Client, featureGate featuregate.FeatureGate) *GPUDevicesValidator {
+	validator := new(GPUDevicesValidator)
+	validator.client = client
+	validator.featureGate = featureGate
+	return validator
+}
+
+// ValidateCreate validates the GPU devices of a VirtualMachine being created.
+// It never produces admission warnings.
+func (v *GPUDevicesValidator) ValidateCreate(ctx context.Context, vm *v1alpha2.VirtualMachine) (admission.Warnings, error) {
+	err := v.validateGPUDevices(ctx, vm)
+	return nil, err
+}
+
+// ValidateUpdate validates the GPU devices of the updated VirtualMachine; the
+// previous object is ignored. It never produces admission warnings.
+//
+// NOTE(review): because the old object is not compared, every update of a VM
+// that still lists GPU devices is re-checked, not only updates that change
+// gpuDevices. Confirm that rejecting unrelated updates while the gate is off
+// or the DeviceClass is missing is intended.
+func (v *GPUDevicesValidator) ValidateUpdate(ctx context.Context, _, newVM *v1alpha2.VirtualMachine) (admission.Warnings, error) {
+	err := v.validateGPUDevices(ctx, newVM)
+	return nil, err
+}
+
+// validateGPUDevices returns nil when the VM requests no GPU devices.
+// Otherwise it returns an error if the GPU feature gate is disabled, if the
+// GPU DeviceClass does not exist, or if the DeviceClass lookup fails for any
+// other reason (that error is wrapped).
+func (v *GPUDevicesValidator) validateGPUDevices(ctx context.Context, vm *v1alpha2.VirtualMachine) error {
+	switch {
+	case len(vm.Spec.GPUDevices) == 0:
+		// Nothing to validate; the gate and the DeviceClass are not consulted.
+		return nil
+	case !v.featureGate.Enabled(featuregates.GPU):
+		return fmt.Errorf("GPU device attachment requires the GPU feature gate")
+	}
+
+	// The DeviceClass is looked up by name only (no namespace in the key).
+	classKey := client.ObjectKey{Name: kvbuilder.GPUDeviceClassName}
+	lookupErr := v.client.Get(ctx, classKey, &resourcev1.DeviceClass{})
+	switch {
+	case lookupErr == nil:
+		return nil
+	case apierrors.IsNotFound(lookupErr):
+		return fmt.Errorf("GPU device attachment requires DeviceClass %q", kvbuilder.GPUDeviceClassName)
+	default:
+		return fmt.Errorf("failed to get GPU DeviceClass %q: %w", kvbuilder.GPUDeviceClassName, lookupErr)
+	}
+}
diff --git a/images/virtualization-artifact/pkg/controller/vm/internal/validators/gpu_devices_validator_test.go
b/images/virtualization-artifact/pkg/controller/vm/internal/validators/gpu_devices_validator_test.go
new file mode 100644
index 0000000000..133b622aa6
--- /dev/null
+++ b/images/virtualization-artifact/pkg/controller/vm/internal/validators/gpu_devices_validator_test.go
@@ -0,0 +1,122 @@
+/*
+Copyright 2026 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package validators
+
+import (
+	"strings"
+	"testing"
+
+	resourcev1 "k8s.io/api/resource/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/component-base/featuregate"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+
+	"github.com/deckhouse/virtualization-controller/pkg/controller/kvbuilder"
+	"github.com/deckhouse/virtualization-controller/pkg/featuregates"
+	"github.com/deckhouse/virtualization/api/core/v1alpha2"
+)
+
+// TestGPUDevicesValidatorValidateCreate runs ValidateCreate against a VM with
+// one GPU device for each combination listed in the table: feature gate off,
+// DeviceClass missing, and both prerequisites present.
+func TestGPUDevicesValidatorValidateCreate(t *testing.T) {
+	tests := []struct {
+		name           string
+		featureEnabled bool
+		// objects are pre-loaded into the fake client.
+		objects []client.Object
+		// wantErrorPart is a substring expected in the error; empty means
+		// the call must succeed.
+		wantErrorPart string
+	}{
+		{
+			name:           "should reject GPU devices when feature is disabled",
+			featureEnabled: false,
+			objects:        []client.Object{newGPUDeviceClass()},
+			wantErrorPart:  "GPU feature gate",
+		},
+		{
+			name:           "should reject GPU devices when DeviceClass is missing",
+			featureEnabled: true,
+			wantErrorPart:  "DeviceClass",
+		},
+		{
+			name:           "should accept GPU devices when feature and DeviceClass are available",
+			featureEnabled: true,
+			objects:        []client.Object{newGPUDeviceClass()},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			vm := newVirtualMachineWithGPU("vm-current", []v1alpha2.GPUDeviceSpec{{Name: "gpu0", Model: "NVIDIA H100"}})
+			validator := NewGPUDevicesValidator(newFakeClientWithResourceObjects(t, tt.objects...), newGPUFeatureGate(t, tt.featureEnabled))
+
+			_, err := validator.ValidateCreate(t.Context(), vm)
+
+			// Success case: no error substring is expected.
+			if tt.wantErrorPart == "" {
+				if err != nil {
+					t.Fatalf("expected no error, got %v", err)
+				}
+				return
+			}
+
+			if err == nil {
+				t.Fatal("expected error, got nil")
+			}
+			if !strings.Contains(err.Error(), tt.wantErrorPart) {
+				t.Fatalf("expected error containing %q, got %v", tt.wantErrorPart, err)
+			}
+		})
+	}
+}
+
+// newVirtualMachineWithGPU returns a VirtualMachine in namespace "default"
+// whose spec contains only the given GPU devices.
+func newVirtualMachineWithGPU(name string, gpuDevices []v1alpha2.GPUDeviceSpec) *v1alpha2.VirtualMachine {
+	return &v1alpha2.VirtualMachine{
+		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "default"},
+		Spec:       v1alpha2.VirtualMachineSpec{GPUDevices: gpuDevices},
+	}
+}
+
+// newGPUDeviceClass returns an otherwise empty DeviceClass named
+// kvbuilder.GPUDeviceClassName.
+func newGPUDeviceClass() *resourcev1.DeviceClass {
+	return &resourcev1.DeviceClass{ObjectMeta: metav1.ObjectMeta{Name: kvbuilder.GPUDeviceClassName}}
+}
+
+// newFakeClientWithResourceObjects builds a fake client whose scheme knows the
+// v1alpha2 and resource/v1 types and which is seeded with the given objects.
+func newFakeClientWithResourceObjects(t *testing.T, objects ...client.Object) client.Client {
+	t.Helper()
+
+	scheme := runtime.NewScheme()
+	if err := v1alpha2.AddToScheme(scheme); err != nil {
+		t.Fatalf("failed to add virtualization API scheme: %v", err)
+	}
+	if err := resourcev1.AddToScheme(scheme); err != nil {
+		t.Fatalf("failed to add resource API scheme: %v", err)
+	}
+
+	return fake.NewClientBuilder().WithScheme(scheme).WithObjects(objects...).Build()
+}
+
+// newGPUFeatureGate returns a feature gate from featuregates.NewUnlocked with
+// the GPU feature set to the requested state.
+func newGPUFeatureGate(t *testing.T, enabled bool) featuregate.FeatureGate {
+	t.Helper()
+
+	gate, setFromMap, err := featuregates.NewUnlocked()
+	if err != nil {
+		t.Fatalf("failed to create feature gate: %v", err)
+	}
+
+	if err = setFromMap(map[string]bool{string(featuregates.GPU): enabled}); err != nil {
+		t.Fatalf("failed to set GPU feature gate: %v", err)
+	}
+
+	return gate
+}
diff --git a/images/virtualization-artifact/pkg/controller/vm/internal/watcher/resourceclaimtemplate_watcher.go b/images/virtualization-artifact/pkg/controller/vm/internal/watcher/resourceclaimtemplate_watcher.go
new file mode 100644
index 0000000000..14dabd46d1
--- /dev/null
+++ b/images/virtualization-artifact/pkg/controller/vm/internal/watcher/resourceclaimtemplate_watcher.go
@@ -0,0 +1,53 @@
+/*
+Copyright 2026 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package watcher
+
+import (
+	"fmt"
+
+	resourcev1 "k8s.io/api/resource/v1"
+	"sigs.k8s.io/controller-runtime/pkg/controller"
+	"sigs.k8s.io/controller-runtime/pkg/handler"
+	"sigs.k8s.io/controller-runtime/pkg/manager"
+	"sigs.k8s.io/controller-runtime/pkg/source"
+
+	"github.com/deckhouse/virtualization/api/core/v1alpha2"
+)
+
+// NewResourceClaimTemplateWatcher returns a watcher for ResourceClaimTemplate
+// objects. It takes no dependencies.
+func NewResourceClaimTemplateWatcher() *ResourceClaimTemplateWatcher {
+	return &ResourceClaimTemplateWatcher{}
+}
+
+// ResourceClaimTemplateWatcher registers a watch on ResourceClaimTemplate
+// objects that maps events to their owning VirtualMachine.
+//
+// NOTE(review): the type carries no feature gate, so nothing here ties the
+// watch to the GPU gate. Confirm how the controller behaves on clusters that
+// do not serve the resource.k8s.io/v1 API.
+type ResourceClaimTemplateWatcher struct{}
+
+// Watch adds a ResourceClaimTemplate source to the controller, backed by the
+// manager's cache. Events are enqueued for the owning VirtualMachine, and only
+// the controller owner reference is considered. Registration failures are
+// returned wrapped.
+func (w *ResourceClaimTemplateWatcher) Watch(mgr manager.Manager, ctr controller.Controller) error {
+	if err := ctr.Watch(
+		source.Kind(
+			mgr.GetCache(),
+			&resourcev1.ResourceClaimTemplate{},
+			handler.TypedEnqueueRequestForOwner[*resourcev1.ResourceClaimTemplate](
+				mgr.GetScheme(),
+				mgr.GetRESTMapper(),
+				&v1alpha2.VirtualMachine{},
+				handler.OnlyControllerOwner(),
+			),
+		),
+	); err != nil {
+		return fmt.Errorf("error setting watch on ResourceClaimTemplate: %w", err)
+	}
+	return nil
+}
diff --git a/images/virtualization-artifact/pkg/controller/vm/vm_controller.go b/images/virtualization-artifact/pkg/controller/vm/vm_controller.go
index ebc466e99e..228a3f6814 100644
--- a/images/virtualization-artifact/pkg/controller/vm/vm_controller.go
+++ b/images/virtualization-artifact/pkg/controller/vm/vm_controller.go
@@ -71,6 +71,7 @@ func SetupController(
 		internal.NewBlockDeviceHandler(client, blockDeviceService),
 		internal.NewUSBDeviceDetachHandler(client, virtClient),
 		internal.NewUSBDeviceAttachHandler(client, virtClient),
+		internal.NewGPUResourceClaimHandler(client),
 		internal.NewProvisioningHandler(client),
 		internal.NewAgentHandler(),
 		internal.NewFilesystemHandler(),
diff --git a/images/virtualization-artifact/pkg/controller/vm/vm_reconciler.go b/images/virtualization-artifact/pkg/controller/vm/vm_reconciler.go
index 300b65acab..2d49683161 100644
--- a/images/virtualization-artifact/pkg/controller/vm/vm_reconciler.go
+++
b/images/virtualization-artifact/pkg/controller/vm/vm_reconciler.go
@@ -71,6 +71,7 @@ func (r *Reconciler) SetupController(_ context.Context, mgr manager.Manager, ctr
 		watcher.NewClusterVirtualImageWatcher(mgr.GetClient()),
 		watcher.NewVirtualDiskWatcher(mgr.GetClient()),
 		watcher.NewUSBDeviceWatcher(mgr.GetClient()),
+		watcher.NewResourceClaimTemplateWatcher(),
 		watcher.NewVMIPWatcher(),
 		watcher.NewVirtualMachineClassWatcher(),
 		watcher.NewVirtualMachineSnapshotWatcher(),
diff --git a/images/virtualization-artifact/pkg/controller/vm/vm_webhook.go b/images/virtualization-artifact/pkg/controller/vm/vm_webhook.go
index d3afa74586..119484eb41 100644
--- a/images/virtualization-artifact/pkg/controller/vm/vm_webhook.go
+++ b/images/virtualization-artifact/pkg/controller/vm/vm_webhook.go
@@ -56,6 +56,7 @@ func NewValidator(client client.Client, blockDeviceService *service.BlockDeviceS
 			validators.NewNetworksValidator(client, featureGate),
 			validators.NewFirstDiskValidator(client),
 			validators.NewUSBDevicesValidator(client, featureGate),
+			validators.NewGPUDevicesValidator(client, featureGate),
 			validators.NewVMBDAConflictValidator(client),
 			validators.NewPVNodeAffinityValidator(client, attachmentService),
 		},
diff --git a/images/virtualization-artifact/pkg/controller/vmchange/compare.go b/images/virtualization-artifact/pkg/controller/vmchange/compare.go
index d734167761..b350187440 100644
--- a/images/virtualization-artifact/pkg/controller/vmchange/compare.go
+++ b/images/virtualization-artifact/pkg/controller/vmchange/compare.go
@@ -81,6 +81,7 @@ func (v *VMSpecComparator) comparators() []VMSpecFieldComparator {
 		vmSpecFieldComparator(compareProvisioning),
 		vmSpecFieldComparator(compareNetworks),
 		vmSpecFieldComparator(compareUSBDevices),
+		vmSpecFieldComparator(compareGPUDevices),
 	}
 }
 
diff --git a/images/virtualization-artifact/pkg/controller/vmchange/compare_test.go b/images/virtualization-artifact/pkg/controller/vmchange/compare_test.go
index 8377ac10fa..240c3ecdae 100644
--- a/images/virtualization-artifact/pkg/controller/vmchange/compare_test.go
+++ b/images/virtualization-artifact/pkg/controller/vmchange/compare_test.go
@@ -684,6 +684,43 @@ networks:
 				requirePathOperation("networks", ChangeReplace),
 			),
 		},
+		// The same two GPU devices listed in a different order must not be
+		// reported as a change.
+		{
+			"no restart when gpu devices only change order",
+			`
+gpuDevices:
+- name: gpu1
+  model: NVIDIA H100
+- name: gpu0
+  model: NVIDIA A100-SXM4-40GB
+`,
+			`
+gpuDevices:
+- name: gpu0
+  model: NVIDIA A100-SXM4-40GB
+- name: gpu1
+  model: NVIDIA H100
+`,
+			nil,
+			assertNoChanges(),
+		},
+		// Changing the model of an existing GPU device must be reported as a
+		// replace of "gpuDevices" that requires a restart.
+		{
+			"restart when gpu device model changes",
+			`
+gpuDevices:
+- name: gpu0
+  model: NVIDIA A100-SXM4-40GB
+`,
+			`
+gpuDevices:
+- name: gpu0
+  model: NVIDIA H100
+`,
+			nil,
+			assertChanges(
+				actionRequired(ActionRestart),
+				requirePathOperation("gpuDevices", ChangeReplace),
+			),
+		},
 	}
 
 	for _, tt := range tests {
@@ -769,6 +806,13 @@ func assertChanges(asserts ...func(t *testing.T, changes SpecChanges)) func(t *t
 	}
 }
 
+// assertNoChanges returns an assertion that fails the test unless the
+// comparison produced an empty change set; the failure message prints the
+// changes that were found.
+func assertNoChanges() func(t *testing.T, changes SpecChanges) {
+	return func(t *testing.T, changes SpecChanges) {
+		t.Helper()
+		require.True(t, changes.IsEmpty(), "expected no changes, got %+v", changes.GetAll())
+	}
+}
+
 func actionRequired(actionType ActionType) func(t *testing.T, changes SpecChanges) {
 	return func(t *testing.T, changes SpecChanges) {
 		t.Helper()
diff --git a/images/virtualization-artifact/pkg/controller/vmchange/gpu_change.go b/images/virtualization-artifact/pkg/controller/vmchange/gpu_change.go
new file mode 100644
index 0000000000..a7be7c057d
--- /dev/null
+++ b/images/virtualization-artifact/pkg/controller/vmchange/gpu_change.go
@@ -0,0 +1,39 @@
+/*
+Copyright 2026 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package vmchange
+
+import (
+	"reflect"
+
+	"github.com/deckhouse/virtualization-controller/pkg/controller/kvbuilder"
+	"github.com/deckhouse/virtualization/api/core/v1alpha2"
+)
+
+// compareGPUDevices compares the gpuDevices lists of the current and desired
+// specs and reports the result under the "gpuDevices" path with ActionRestart.
+// Both lists are passed through kvbuilder.SortGPUDevices first and the sorted
+// copies are compared with reflect.DeepEqual.
+//
+// NOTE(review): reflect.DeepEqual treats a nil slice and an empty non-nil
+// slice as different. Whether that can surface here depends on what
+// SortGPUDevices returns for nil/empty input — confirm.
+func compareGPUDevices(current, desired *v1alpha2.VirtualMachineSpec) []FieldChange {
+	currentGPUDevices := kvbuilder.SortGPUDevices(current.GPUDevices)
+	desiredGPUDevices := kvbuilder.SortGPUDevices(desired.GPUDevices)
+	// The second NewValue argument is derived from the unsorted field being
+	// nil (presumably the "is empty" flag — verify against NewValue).
+	currentValue := NewValue(currentGPUDevices, current.GPUDevices == nil, false)
+	desiredValue := NewValue(desiredGPUDevices, desired.GPUDevices == nil, false)
+
+	return compareValues(
+		"gpuDevices",
+		currentValue,
+		desiredValue,
+		reflect.DeepEqual(currentGPUDevices, desiredGPUDevices),
+		ActionRestart,
+	)
+}
diff --git a/images/virtualization-artifact/pkg/featuregates/featuregate.go b/images/virtualization-artifact/pkg/featuregates/featuregate.go
index 3358b0a85a..b9acd85efc 100644
--- a/images/virtualization-artifact/pkg/featuregates/featuregate.go
+++ b/images/virtualization-artifact/pkg/featuregates/featuregate.go
@@ -30,6 +30,7 @@ const (
 	VolumeMigration                featuregate.Feature = "VolumeMigration"
 	TargetMigration                featuregate.Feature = "TargetMigration"
 	USB                            featuregate.Feature = "USB"
+	GPU                            featuregate.Feature = "GPU"
 	HotplugCPUWithLiveMigration    featuregate.Feature = "HotplugCPUWithLiveMigration"
 	HotplugMemoryWithLiveMigration featuregate.Feature = "HotplugMemoryWithLiveMigration"
 )
@@ -59,6 +60,11 @@ var featureSpecs = map[featuregate.Feature]featuregate.FeatureSpec{
 		LockToDefault: true,
 		PreRelease:    featuregate.Alpha,
 	},
+	// GPU is off by default and is locked to that default when the edition
+	// is CE.
+	GPU: {
+		Default:       false,
+		LockToDefault: version.GetEdition() == version.EditionCE,
+		PreRelease:    featuregate.Alpha,
+	},
 	HotplugCPUWithLiveMigration: {
 		Default:       false,
 		LockToDefault: version.GetEdition() == version.EditionCE,
diff --git a/openapi/config-values.yaml b/openapi/config-values.yaml
index 5d3afcd436..cfe6816780 100644
--- a/openapi/config-values.yaml
+++ b/openapi/config-values.yaml
@@ -225,8 +225,10 @@ properties:
 
           - `HotplugCPUWithLiveMigration` — enable live changing of cpu cores number. (Not available in CE);
           - `HotplugMemoryWithLiveMigration` — enable live changing of memory size. (Not available in CE);
+          - `GPU` — enable attaching GPU devices to virtual machines via DRA by product model. (Not available in CE);
         items:
           type: string
           enum:
             - "HotplugCPUWithLiveMigration"
             - "HotplugMemoryWithLiveMigration"
+            - "GPU"
diff --git a/openapi/doc-ru-config-values.yaml b/openapi/doc-ru-config-values.yaml
index 8579d3e3ee..0d1c905be3 100644
--- a/openapi/doc-ru-config-values.yaml
+++ b/openapi/doc-ru-config-values.yaml
@@ -155,5 +155,6 @@ properties:
 
           - `HotplugCPUWithLiveMigration` — включить изменение количества ядер процессора без перезагрузки. (Не доступно в CE);
           - `HotplugMemoryWithLiveMigration` — включить изменение размера памяти без перезагрузки. (Не доступно в CE);
+          - `GPU` — включить подключение GPU-устройств к виртуальным машинам через DRA по модели продукта. (Не доступно в CE);
         items:
           type: string
diff --git a/templates/virtualization-controller/rbac-for-us.yaml b/templates/virtualization-controller/rbac-for-us.yaml
index 6b46cbfc5a..be5bd4392d 100644
--- a/templates/virtualization-controller/rbac-for-us.yaml
+++ b/templates/virtualization-controller/rbac-for-us.yaml
@@ -327,6 +327,14 @@ rules:
   - update
   - patch
   - delete
+# Read-only access to DeviceClass objects in the resource.k8s.io group.
+# NOTE(review): this rule covers only deviceclasses; confirm that access to
+# resourceclaimtemplates is granted elsewhere in this file.
+- apiGroups:
+  - resource.k8s.io
+  resources:
+  - deviceclasses
+  verbs:
+  - get
+  - list
+  - watch
 - apiGroups:
   - apiextensions.k8s.io
   resources: