From 79f6ca3d6280825bc2726169a0c79bed7e35a97a Mon Sep 17 00:00:00 2001 From: Pavol Pitonak Date: Wed, 24 Jun 2026 15:37:21 +0200 Subject: [PATCH] feat(azure/rhel-ai): add Azure Marketplace image support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add --marketplace flag to deploy RHEL AI from the Azure Marketplace instead of the shared gallery. The SKU is constructed from accelerator type (cuda→nvidia, rocm→amd) and GPU count (1/2/4/8), with gen2 handling delegated to SkuG2Support. Includes a Plan block on the VM resource for marketplace purchase plan acceptance and a helpful error message when marketplace terms have not been accepted. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Pavol Pitonak --- cmd/mapt/cmd/azure/hosts/rhelai.go | 2 + cmd/mapt/cmd/params/params.go | 2 + pkg/provider/azure/action/rhel-ai/rhelai.go | 84 +++++++++++++++---- pkg/provider/azure/data/imageref.go | 8 ++ .../virtual-machine/virtual-machine.go | 14 +++- pkg/target/host/rhelai/api.go | 1 + tkn/infra-azure-rhel-ai.yaml | 8 +- tkn/template/infra-azure-rhel-ai.yaml | 20 ++++- 8 files changed, 118 insertions(+), 21 deletions(-) diff --git a/cmd/mapt/cmd/azure/hosts/rhelai.go b/cmd/mapt/cmd/azure/hosts/rhelai.go index 29c78b53d..5055015fe 100644 --- a/cmd/mapt/cmd/azure/hosts/rhelai.go +++ b/cmd/mapt/cmd/azure/hosts/rhelai.go @@ -63,6 +63,7 @@ func getRHELAICreate() *cobra.Command { Version: viper.GetString(params.RhelAIVersion), Accelerator: viper.GetString(params.RhelAIAccelerator), CustomImage: viper.GetString(params.RhelAICustomImage), + Marketplace: viper.GetBool(params.RhelAIMarketplace), ComputeRequest: params.ComputeRequestArgs(), Spot: params.SpotArgs(), Timeout: viper.GetString(params.Timeout), @@ -75,6 +76,7 @@ func getRHELAICreate() *cobra.Command { flagSet.StringP(params.RhelAIVersion, "", params.RhelAIVersionDefault, params.RhelAIVersionDesc) flagSet.StringP(params.RhelAIAccelerator, "", params.RhelAIAccelearatorDefault, params.RhelAIAccelearatorDesc) flagSet.StringP(params.RhelAICustomImage, "", "", params.RhelAICustomImageDesc) + flagSet.Bool(params.RhelAIMarketplace, false, params.RhelAIMarketplaceDesc) flagSet.StringP(params.Timeout, "", "", params.TimeoutDesc) params.AddComputeRequestFlags(flagSet) params.AddSpotFlags(flagSet) diff --git a/cmd/mapt/cmd/params/params.go b/cmd/mapt/cmd/params/params.go index f6583a661..c0dfad656 100644 --- a/cmd/mapt/cmd/params/params.go +++ b/cmd/mapt/cmd/params/params.go @@ -119,6 +119,8 @@ const ( RhelAIAccelearatorDefault string = "cuda" RhelAICustomImage string = "custom-image" RhelAICustomImageDesc string = "custom image name to spin RHEL AI OS (AMI name for AWS, image name for Azure)" + RhelAIMarketplace string = "marketplace" + RhelAIMarketplaceDesc string = "use the cloud provider's marketplace RHEL AI image instead of a shared gallery or custom image" // Serverless Timeout string = "timeout" diff --git a/pkg/provider/azure/action/rhel-ai/rhelai.go b/pkg/provider/azure/action/rhel-ai/rhelai.go index cb254fe51..ed3d3fc40 100644 --- a/pkg/provider/azure/action/rhel-ai/rhelai.go +++ b/pkg/provider/azure/action/rhel-ai/rhelai.go @@ -8,6 +8,7 @@ import ( maptContext "github.com/redhat-developer/mapt/pkg/manager/context" azureLinux "github.com/redhat-developer/mapt/pkg/provider/azure/action/linux" + cr "github.com/redhat-developer/mapt/pkg/provider/api/compute-request" "github.com/redhat-developer/mapt/pkg/provider/azure/data" "github.com/redhat-developer/mapt/pkg/provider/util/command" apiRHELAI "github.com/redhat-developer/mapt/pkg/target/host/rhelai" @@ -22,6 +23,13 @@ const ( // $1 subscriptionId $2 rgName $3 galleryName $4 imageName imageIdRegex = "/subscriptions/%s/resourceGroups/" + imageOwnerResourceGroup + "/providers/Microsoft.Compute/galleries/%s/images/%s/versions/1.0.0" + // Marketplace image coordinates + marketplacePublisher = "RedHat" + marketplaceOffer = "rh-rhel-ai" + marketplacePlanPublisher = "redhat" + // SKU pattern: rh-rhelai-{nvidia|amd}-{N}gpu (gen2 handled by SkuG2Support) + marketplaceSkuRegex = "rh-rhelai-%s-%dgpu" + username = "azureuser" ) @@ -37,6 +45,13 @@ func imageId(accelerator, version string) string { return imageIdFromName(fmt.Sprintf(imageNameRegex, accelerator, version)) } +var acceleratorToMarketplace = map[string]string{ + "cuda": "nvidia", + "rocm": "amd", +} + +var validMarketplaceGPUCounts = map[int32]bool{1: true, 2: true, 4: true, 8: true} + // isGPUCapableSize returns true for ND-series and NC-series Azure VM sizes, // which are the compute GPU families supported for RHEL AI workloads. // NV-series (visualization GPUs) is intentionally excluded. @@ -50,10 +65,6 @@ func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err return fmt.Errorf("RHEL AI: args and ComputeRequest must not be nil") } logging.Debug("Creating RHEL AI Server") - sharedImageID := imageId(args.Accelerator, args.Version) - if args.CustomImage != "" { - sharedImageID = imageIdFromName(args.CustomImage) - } // Shallow-copy to avoid mutating the caller's ComputeRequestArgs. computeReq := *args.ComputeRequest // Ensure GPU-capable instance selection for auto-selection paths. @@ -68,27 +79,66 @@ func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err return fmt.Errorf("RHEL AI: %q is not GPU-capable (expected ND-series or NC-series for vllm)", s) } } + imageRef, err := resolveImageSource(args, &computeReq) + if err != nil { + return err + } azureLinuxRequest := &azureLinux.LinuxArgs{ - Prefix: args.Prefix, - ComputeRequest: &computeReq, - Spot: args.Spot, - ImageRef: &data.ImageReference{ - SharedImageID: sharedImageID, - // Belt-and-suspenders: set SCSI explicitly so Azure never infers a - // conflicting default. resolveImageRef will also derive this from the - // gallery image's Features, but the static value protects against API - // failures or future images with multiple supported types. - DiskControllerType: "SCSI", - }, + Prefix: args.Prefix, + ComputeRequest: &computeReq, + Spot: args.Spot, + ImageRef: imageRef, Username: username, ReadinessCommand: command.CommandPing} - if err = azureLinux.Create(mCtxArgs, azureLinuxRequest); err != nil && len(computeReq.ComputeSizes) == 0 { - return fmt.Errorf("RHEL AI: failed to provision a GPU-capable instance (ND/NC-series required for vllm); verify GPU quota in the target location/subscription: %w", err) + if err = azureLinux.Create(mCtxArgs, azureLinuxRequest); err != nil { + if args.Marketplace && imageRef.Plan != nil && + (strings.Contains(err.Error(), "ResourcePurchaseValidationFailed") || + strings.Contains(err.Error(), "MarketplacePurchaseEligibilityFailed")) { + return fmt.Errorf("RHEL AI marketplace: terms not accepted; run: az vm image terms accept --publisher %s --offer %s --plan %s\n%w", + imageRef.Plan.Publisher, marketplaceOffer, imageRef.Plan.Name, err) + } + if len(computeReq.ComputeSizes) == 0 { + return fmt.Errorf("RHEL AI: failed to provision a GPU-capable instance (ND/NC-series required for vllm); verify GPU quota in the target location/subscription: %w", err) + } } return err } +func resolveImageSource(args *apiRHELAI.RHELAIArgs, computeReq *cr.ComputeRequestArgs) (*data.ImageReference, error) { + if args.Marketplace { + gpus := computeReq.GPUs + if !validMarketplaceGPUCounts[gpus] { + return nil, fmt.Errorf("RHEL AI marketplace: --gpus must be 1, 2, 4, or 8 (got %d)", gpus) + } + accName, ok := acceleratorToMarketplace[strings.ToLower(args.Accelerator)] + if !ok { + return nil, fmt.Errorf("RHEL AI marketplace: unsupported accelerator %q (expected cuda or rocm)", args.Accelerator) + } + sku := fmt.Sprintf(marketplaceSkuRegex, accName, gpus) + return &data.ImageReference{ + Publisher: marketplacePublisher, + Offer: marketplaceOffer, + Sku: sku, + Plan: &data.MarketplacePlan{ + Name: sku, + Product: marketplaceOffer, + Publisher: marketplacePlanPublisher, + }, + }, nil + } + if args.CustomImage != "" { + return &data.ImageReference{ + SharedImageID: imageIdFromName(args.CustomImage), + DiskControllerType: "SCSI", + }, nil + } + return &data.ImageReference{ + SharedImageID: imageId(args.Accelerator, args.Version), + DiskControllerType: "SCSI", + }, nil +} + func Destroy(mCtxArgs *maptContext.ContextArgs) error { return azureLinux.Destroy(mCtxArgs) } diff --git a/pkg/provider/azure/data/imageref.go b/pkg/provider/azure/data/imageref.go index fb483c833..18f516d6a 100644 --- a/pkg/provider/azure/data/imageref.go +++ b/pkg/provider/azure/data/imageref.go @@ -21,6 +21,12 @@ const fedoraImageGalleryBase = "/CommunityGalleries/Fedora-5e266ba4-2250-406d-ad // /subscriptions/02db6bd4-035c-4074-b699-468f3d914744/resourceGroups/RHEL-AI-CUDA-AZURE-3.0.0/providers/Microsoft.Compute/galleries/rhel_ai_cuda_azure_3.0.0/images/rhel-ai-cuda-azure-3.0.0/versions/1.0.0 +type MarketplacePlan struct { + Name string + Product string + Publisher string +} + type ImageReference struct { // Market Place Publisher string @@ -33,6 +39,8 @@ type ImageReference struct { // Required disk controller type for this image (e.g. "SCSI", "NVMe"). // Empty means no specific requirement; Azure uses the VM size default. DiskControllerType string + // Non-nil when the image requires a purchase plan (e.g. marketplace images). + Plan *MarketplacePlan } var ( diff --git a/pkg/provider/azure/modules/virtual-machine/virtual-machine.go b/pkg/provider/azure/modules/virtual-machine/virtual-machine.go index 87d1899c6..e3f4849e7 100644 --- a/pkg/provider/azure/modules/virtual-machine/virtual-machine.go +++ b/pkg/provider/azure/modules/virtual-machine/virtual-machine.go @@ -47,7 +47,7 @@ type VirtualMachine = *compute.VirtualMachine // Create virtual machine based on request + export to context // adminusername and adminuserpassword func Create(ctx *pulumi.Context, mCtx *mc.Context, args *VirtualMachineArgs) (VirtualMachine, error) { - ira, err := convertImageRef(mCtx, *args.Image, args.Location) + ira, err := convertImageRef(mCtx, args.Image, args.Location) if err != nil { return nil, err } @@ -99,6 +99,13 @@ func Create(ctx *pulumi.Context, mCtx *mc.Context, args *VirtualMachineArgs) (Vi MaxPrice: pulumi.Float64(*args.SpotPrice), } } + if args.Image.Plan != nil { + vmArgs.Plan = compute.PlanArgs{ + Name: pulumi.String(args.Image.Plan.Name), + Product: pulumi.String(args.Image.Plan.Product), + Publisher: pulumi.String(args.Image.Plan.Publisher), + } + } logging.Debug("About to create the VM with compute.NewVirtualMachine") return compute.NewVirtualMachine(ctx, resourcesUtil.GetResourceName(args.Prefix, args.ComponentID, "vm"), @@ -130,7 +137,7 @@ func osProfile(computerName string, args *VirtualMachineArgs) compute.OSProfileA return osProfile } -func convertImageRef(mCtx *mc.Context, i data.ImageReference, location string) (*compute.ImageReferenceArgs, error) { +func convertImageRef(mCtx *mc.Context, i *data.ImageReference, location string) (*compute.ImageReferenceArgs, error) { if len(i.CommunityImageID) > 0 { return &compute.ImageReferenceArgs{ CommunityGalleryImageId: pulumi.String(i.CommunityImageID), @@ -151,6 +158,9 @@ func convertImageRef(mCtx *mc.Context, i data.ImageReference, location string) ( if err != nil { return nil, err } + if i.Plan != nil && finalSku != i.Sku { + i.Plan.Name = finalSku + } return &compute.ImageReferenceArgs{ Publisher: pulumi.String(i.Publisher), Offer: pulumi.String(i.Offer), diff --git a/pkg/target/host/rhelai/api.go b/pkg/target/host/rhelai/api.go index 9676c7a35..857e9fb47 100644 --- a/pkg/target/host/rhelai/api.go +++ b/pkg/target/host/rhelai/api.go @@ -10,6 +10,7 @@ type RHELAIArgs struct { Accelerator string Version string CustomImage string + Marketplace bool Arch string ComputeRequest *cr.ComputeRequestArgs Spot *spotTypes.SpotArgs diff --git a/tkn/infra-azure-rhel-ai.yaml b/tkn/infra-azure-rhel-ai.yaml index fd0e95392..21c9e87b0 100644 --- a/tkn/infra-azure-rhel-ai.yaml +++ b/tkn/infra-azure-rhel-ai.yaml @@ -110,6 +110,9 @@ spec: - name: version description: Version of RHEL AI OS (default 3.2.0) default: "3.2.0" + - name: marketplace + description: Use the cloud provider's marketplace RHEL AI image instead of a shared gallery or custom image + default: "false" # Metadata params - name: tags @@ -226,7 +229,10 @@ spec: if [[ "$(params.compute-sizes)" != "" ]]; then cmd+="--compute-sizes '$(params.compute-sizes)' " fi - if [[ "$(params.custom-image)" != "" ]]; then + if [[ "$(params.marketplace)" == "true" ]]; then + cmd+="--marketplace " + cmd+="--accelerator '$(params.accelerator)' " + elif [[ "$(params.custom-image)" != "" ]]; then cmd+="--custom-image '$(params.custom-image)' " else cmd+="--accelerator '$(params.accelerator)' " diff --git a/tkn/template/infra-azure-rhel-ai.yaml b/tkn/template/infra-azure-rhel-ai.yaml index b6f26bae1..bf07f860a 100644 --- a/tkn/template/infra-azure-rhel-ai.yaml +++ b/tkn/template/infra-azure-rhel-ai.yaml @@ -85,6 +85,12 @@ spec: - name: disk-size description: Disk size in GB for the cloud instance default: "200" + - name: gpus + description: Number of GPUs for the cloud instance (valid marketplace values are 1, 2, 4, 8) + default: "8" + - name: gpu-manufacturer + description: GPU manufacturer name for instance filtering (e.g. NVIDIA, AMD) + default: "" - name: compute-sizes description: Comma seperated list of sizes for the machines to be requested. If set this takes precedence over compute by args default: "Standard_ND96is_MI300X_v5,Standard_ND96isr_MI300X_v5" @@ -110,6 +116,9 @@ spec: - name: version description: Version of RHEL AI OS (default 3.2.0) default: "3.2.0" + - name: marketplace + description: Use the cloud provider's marketplace RHEL AI image instead of a shared gallery or custom image + default: "false" # Metadata params - name: tags @@ -226,7 +235,16 @@ spec: if [[ "$(params.compute-sizes)" != "" ]]; then cmd+="--compute-sizes '$(params.compute-sizes)' " fi - if [[ "$(params.custom-image)" != "" ]]; then + if [[ "$(params.gpus)" != "" ]]; then + cmd+="--gpus '$(params.gpus)' " + fi + if [[ "$(params.gpu-manufacturer)" != "" ]]; then + cmd+="--gpu-manufacturer '$(params.gpu-manufacturer)' " + fi + if [[ "$(params.marketplace)" == "true" ]]; then + cmd+="--marketplace " + cmd+="--accelerator '$(params.accelerator)' " + elif [[ "$(params.custom-image)" != "" ]]; then cmd+="--custom-image '$(params.custom-image)' " else cmd+="--accelerator '$(params.accelerator)' "