Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cmd/mapt/cmd/azure/hosts/rhelai.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ func getRHELAICreate() *cobra.Command {
Version: viper.GetString(params.RhelAIVersion),
Accelerator: viper.GetString(params.RhelAIAccelerator),
CustomImage: viper.GetString(params.RhelAICustomImage),
Marketplace: viper.GetBool(params.RhelAIMarketplace),
ComputeRequest: params.ComputeRequestArgs(),
Spot: params.SpotArgs(),
Timeout: viper.GetString(params.Timeout),
Expand All @@ -75,6 +76,7 @@ func getRHELAICreate() *cobra.Command {
flagSet.StringP(params.RhelAIVersion, "", params.RhelAIVersionDefault, params.RhelAIVersionDesc)
flagSet.StringP(params.RhelAIAccelerator, "", params.RhelAIAccelearatorDefault, params.RhelAIAccelearatorDesc)
flagSet.StringP(params.RhelAICustomImage, "", "", params.RhelAICustomImageDesc)
flagSet.Bool(params.RhelAIMarketplace, false, params.RhelAIMarketplaceDesc)
flagSet.StringP(params.Timeout, "", "", params.TimeoutDesc)
params.AddComputeRequestFlags(flagSet)
params.AddSpotFlags(flagSet)
Expand Down
2 changes: 2 additions & 0 deletions cmd/mapt/cmd/params/params.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ const (
RhelAIAccelearatorDefault string = "cuda"
RhelAICustomImage string = "custom-image"
RhelAICustomImageDesc string = "custom image name to spin RHEL AI OS (AMI name for AWS, image name for Azure)"
RhelAIMarketplace string = "marketplace"
RhelAIMarketplaceDesc string = "use the cloud provider's marketplace RHEL AI image instead of a shared gallery or custom image"

// Serverless
Timeout string = "timeout"
Expand Down
84 changes: 67 additions & 17 deletions pkg/provider/azure/action/rhel-ai/rhelai.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (

maptContext "github.com/redhat-developer/mapt/pkg/manager/context"
azureLinux "github.com/redhat-developer/mapt/pkg/provider/azure/action/linux"
cr "github.com/redhat-developer/mapt/pkg/provider/api/compute-request"
"github.com/redhat-developer/mapt/pkg/provider/azure/data"
"github.com/redhat-developer/mapt/pkg/provider/util/command"
apiRHELAI "github.com/redhat-developer/mapt/pkg/target/host/rhelai"
Expand All @@ -22,6 +23,13 @@ const (
// $1 subscriptionId $2 rgName $3 galleryName $4 imageName
imageIdRegex = "/subscriptions/%s/resourceGroups/" + imageOwnerResourceGroup + "/providers/Microsoft.Compute/galleries/%s/images/%s/versions/1.0.0"

// Marketplace image coordinates
marketplacePublisher = "RedHat"
marketplaceOffer = "rh-rhel-ai"
marketplacePlanPublisher = "redhat"
// SKU pattern: rh-rhelai-{nvidia|amd}-{N}gpu (gen2 handled by SkuG2Support)
marketplaceSkuRegex = "rh-rhelai-%s-%dgpu"

username = "azureuser"
)

Expand All @@ -37,6 +45,13 @@ func imageId(accelerator, version string) string {
return imageIdFromName(fmt.Sprintf(imageNameRegex, accelerator, version))
}

// acceleratorToMarketplace translates the accelerator value supplied by the
// caller ("cuda" or "rocm") into the vendor token used inside marketplace SKU
// names ("nvidia" or "amd"). Lookups are done on the lower-cased accelerator,
// so keys must stay lower case.
var acceleratorToMarketplace = map[string]string{
"cuda": "nvidia",
"rocm": "amd",
}

// validMarketplaceGPUCounts is the set of GPU counts accepted when building a
// marketplace SKU; any other count is rejected before the SKU is formatted.
var validMarketplaceGPUCounts = map[int32]bool{1: true, 2: true, 4: true, 8: true}

// isGPUCapableSize returns true for ND-series and NC-series Azure VM sizes,
// which are the compute GPU families supported for RHEL AI workloads.
// NV-series (visualization GPUs) is intentionally excluded.
Expand All @@ -50,10 +65,6 @@ func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err
return fmt.Errorf("RHEL AI: args and ComputeRequest must not be nil")
}
logging.Debug("Creating RHEL AI Server")
sharedImageID := imageId(args.Accelerator, args.Version)
if args.CustomImage != "" {
sharedImageID = imageIdFromName(args.CustomImage)
}
// Shallow-copy to avoid mutating the caller's ComputeRequestArgs.
computeReq := *args.ComputeRequest
// Ensure GPU-capable instance selection for auto-selection paths.
Expand All @@ -68,27 +79,66 @@ func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err
return fmt.Errorf("RHEL AI: %q is not GPU-capable (expected ND-series or NC-series for vllm)", s)
}
}
imageRef, err := resolveImageSource(args, &computeReq)
if err != nil {
return err
}
azureLinuxRequest :=
&azureLinux.LinuxArgs{
Prefix: args.Prefix,
ComputeRequest: &computeReq,
Spot: args.Spot,
ImageRef: &data.ImageReference{
SharedImageID: sharedImageID,
// Belt-and-suspenders: set SCSI explicitly so Azure never infers a
// conflicting default. resolveImageRef will also derive this from the
// gallery image's Features, but the static value protects against API
// failures or future images with multiple supported types.
DiskControllerType: "SCSI",
},
Prefix: args.Prefix,
ComputeRequest: &computeReq,
Spot: args.Spot,
ImageRef: imageRef,
Username: username,
ReadinessCommand: command.CommandPing}
if err = azureLinux.Create(mCtxArgs, azureLinuxRequest); err != nil && len(computeReq.ComputeSizes) == 0 {
return fmt.Errorf("RHEL AI: failed to provision a GPU-capable instance (ND/NC-series required for vllm); verify GPU quota in the target location/subscription: %w", err)
if err = azureLinux.Create(mCtxArgs, azureLinuxRequest); err != nil {
if args.Marketplace && imageRef.Plan != nil &&
(strings.Contains(err.Error(), "ResourcePurchaseValidationFailed") ||
strings.Contains(err.Error(), "MarketplacePurchaseEligibilityFailed")) {
return fmt.Errorf("RHEL AI marketplace: terms not accepted; run: az vm image terms accept --publisher %s --offer %s --plan %s\n%w",
imageRef.Plan.Publisher, marketplaceOffer, imageRef.Plan.Name, err)
}
if len(computeReq.ComputeSizes) == 0 {
return fmt.Errorf("RHEL AI: failed to provision a GPU-capable instance (ND/NC-series required for vllm); verify GPU quota in the target location/subscription: %w", err)
}
}
return err
}

// resolveImageSource builds the image reference the RHEL AI VM is created from.
//
// When args.Marketplace is set the marketplace offer is used: the GPU count
// taken from computeReq and the accelerator taken from args are validated and
// combined into the SKU, which is also used as the purchase plan name.
// Otherwise a shared gallery image is referenced, named by args.CustomImage
// when provided and derived from accelerator and version when not; both
// gallery variants pin the disk controller type to SCSI.
//
// An error is returned only on the marketplace path, for a GPU count that is
// not in validMarketplaceGPUCounts or an accelerator that has no entry in
// acceleratorToMarketplace.
func resolveImageSource(args *apiRHELAI.RHELAIArgs, computeReq *cr.ComputeRequestArgs) (*data.ImageReference, error) {
	// Gallery path: a custom image name wins over the accelerator/version pair.
	if !args.Marketplace {
		var sharedID string
		if args.CustomImage == "" {
			sharedID = imageId(args.Accelerator, args.Version)
		} else {
			sharedID = imageIdFromName(args.CustomImage)
		}
		return &data.ImageReference{
			SharedImageID:      sharedID,
			DiskControllerType: "SCSI",
		}, nil
	}

	// Marketplace path: the GPU count is validated before the accelerator.
	gpuCount := computeReq.GPUs
	if allowed := validMarketplaceGPUCounts[gpuCount]; !allowed {
		return nil, fmt.Errorf("RHEL AI marketplace: --gpus must be 1, 2, 4, or 8 (got %d)", gpuCount)
	}
	vendor, known := acceleratorToMarketplace[strings.ToLower(args.Accelerator)]
	if !known {
		return nil, fmt.Errorf("RHEL AI marketplace: unsupported accelerator %q (expected cuda or rocm)", args.Accelerator)
	}

	// The same SKU string identifies both the image and its purchase plan.
	skuName := fmt.Sprintf(marketplaceSkuRegex, vendor, gpuCount)
	plan := &data.MarketplacePlan{
		Name:      skuName,
		Product:   marketplaceOffer,
		Publisher: marketplacePlanPublisher,
	}
	return &data.ImageReference{
		Publisher: marketplacePublisher,
		Offer:     marketplaceOffer,
		Sku:       skuName,
		Plan:      plan,
	}, nil
}

// Destroy forwards the given context arguments to azureLinux.Destroy and
// returns its error unchanged; no RHEL AI specific handling is done here.
func Destroy(mCtxArgs *maptContext.ContextArgs) error {
return azureLinux.Destroy(mCtxArgs)
}
Expand Down
8 changes: 8 additions & 0 deletions pkg/provider/azure/data/imageref.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ const fedoraImageGalleryBase = "/CommunityGalleries/Fedora-5e266ba4-2250-406d-ad

// /subscriptions/02db6bd4-035c-4074-b699-468f3d914744/resourceGroups/RHEL-AI-CUDA-AZURE-3.0.0/providers/Microsoft.Compute/galleries/rhel_ai_cuda_azure_3.0.0/images/rhel-ai-cuda-azure-3.0.0/versions/1.0.0

// MarketplacePlan holds the purchase plan details for an image that requires
// one (e.g. marketplace images). Its fields are copied into the plan
// arguments of the virtual machine when the image reference carries a plan.
type MarketplacePlan struct {
// Name is the plan name; for the RHEL AI marketplace image it is set to the
// image SKU.
Name string
// Product is the plan product; for the RHEL AI marketplace image it is set
// to the image offer.
Product string
// Publisher is the plan publisher.
Publisher string
}

type ImageReference struct {
// Market Place
Publisher string
Expand All @@ -33,6 +39,8 @@ type ImageReference struct {
// Required disk controller type for this image (e.g. "SCSI", "NVMe").
// Empty means no specific requirement; Azure uses the VM size default.
DiskControllerType string
// Non-nil when the image requires a purchase plan (e.g. marketplace images).
Plan *MarketplacePlan
}

var (
Expand Down
14 changes: 12 additions & 2 deletions pkg/provider/azure/modules/virtual-machine/virtual-machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ type VirtualMachine = *compute.VirtualMachine
// Create virtual machine based on request + export to context
// adminusername and adminuserpassword
func Create(ctx *pulumi.Context, mCtx *mc.Context, args *VirtualMachineArgs) (VirtualMachine, error) {
ira, err := convertImageRef(mCtx, *args.Image, args.Location)
ira, err := convertImageRef(mCtx, args.Image, args.Location)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -99,6 +99,13 @@ func Create(ctx *pulumi.Context, mCtx *mc.Context, args *VirtualMachineArgs) (Vi
MaxPrice: pulumi.Float64(*args.SpotPrice),
}
}
if args.Image.Plan != nil {
vmArgs.Plan = compute.PlanArgs{
Name: pulumi.String(args.Image.Plan.Name),
Product: pulumi.String(args.Image.Plan.Product),
Publisher: pulumi.String(args.Image.Plan.Publisher),
}
}
logging.Debug("About to create the VM with compute.NewVirtualMachine")
return compute.NewVirtualMachine(ctx,
resourcesUtil.GetResourceName(args.Prefix, args.ComponentID, "vm"),
Expand Down Expand Up @@ -130,7 +137,7 @@ func osProfile(computerName string, args *VirtualMachineArgs) compute.OSProfileA
return osProfile
}

func convertImageRef(mCtx *mc.Context, i data.ImageReference, location string) (*compute.ImageReferenceArgs, error) {
func convertImageRef(mCtx *mc.Context, i *data.ImageReference, location string) (*compute.ImageReferenceArgs, error) {
if len(i.CommunityImageID) > 0 {
return &compute.ImageReferenceArgs{
CommunityGalleryImageId: pulumi.String(i.CommunityImageID),
Expand All @@ -151,6 +158,9 @@ func convertImageRef(mCtx *mc.Context, i data.ImageReference, location string) (
if err != nil {
return nil, err
}
if i.Plan != nil && finalSku != i.Sku {
i.Plan.Name = finalSku
}
return &compute.ImageReferenceArgs{
Publisher: pulumi.String(i.Publisher),
Offer: pulumi.String(i.Offer),
Expand Down
1 change: 1 addition & 0 deletions pkg/target/host/rhelai/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ type RHELAIArgs struct {
Accelerator string
Version string
CustomImage string
Marketplace bool
Arch string
ComputeRequest *cr.ComputeRequestArgs
Spot *spotTypes.SpotArgs
Expand Down
8 changes: 7 additions & 1 deletion tkn/infra-azure-rhel-ai.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@ spec:
- name: version
description: Version of RHEL AI OS (default 3.2.0)
default: "3.2.0"
- name: marketplace
description: Use the cloud provider's marketplace RHEL AI image instead of a shared gallery or custom image
default: "false"

# Metadata params
- name: tags
Expand Down Expand Up @@ -226,7 +229,10 @@ spec:
if [[ "$(params.compute-sizes)" != "" ]]; then
cmd+="--compute-sizes '$(params.compute-sizes)' "
fi
if [[ "$(params.custom-image)" != "" ]]; then
if [[ "$(params.marketplace)" == "true" ]]; then
cmd+="--marketplace "
cmd+="--accelerator '$(params.accelerator)' "
elif [[ "$(params.custom-image)" != "" ]]; then
cmd+="--custom-image '$(params.custom-image)' "
else
cmd+="--accelerator '$(params.accelerator)' "
Expand Down
20 changes: 19 additions & 1 deletion tkn/template/infra-azure-rhel-ai.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@ spec:
- name: disk-size
description: Disk size in GB for the cloud instance
default: "200"
- name: gpus
description: Number of GPUs for the cloud instance (valid marketplace values are 1, 2, 4, 8)
default: "8"
- name: gpu-manufacturer
description: GPU manufacturer name for instance filtering (e.g. NVIDIA, AMD)
default: ""
- name: compute-sizes
description: Comma separated list of sizes for the machines to be requested. If set this takes precedence over compute by args
default: "Standard_ND96is_MI300X_v5,Standard_ND96isr_MI300X_v5"
Expand All @@ -110,6 +116,9 @@ spec:
- name: version
description: Version of RHEL AI OS (default 3.2.0)
default: "3.2.0"
- name: marketplace
description: Use the cloud provider's marketplace RHEL AI image instead of a shared gallery or custom image
default: "false"

# Metadata params
- name: tags
Expand Down Expand Up @@ -226,7 +235,16 @@ spec:
if [[ "$(params.compute-sizes)" != "" ]]; then
cmd+="--compute-sizes '$(params.compute-sizes)' "
fi
if [[ "$(params.custom-image)" != "" ]]; then
if [[ "$(params.gpus)" != "" ]]; then
cmd+="--gpus '$(params.gpus)' "
fi
if [[ "$(params.gpu-manufacturer)" != "" ]]; then
cmd+="--gpu-manufacturer '$(params.gpu-manufacturer)' "
fi
if [[ "$(params.marketplace)" == "true" ]]; then
cmd+="--marketplace "
cmd+="--accelerator '$(params.accelerator)' "
elif [[ "$(params.custom-image)" != "" ]]; then
cmd+="--custom-image '$(params.custom-image)' "
else
cmd+="--accelerator '$(params.accelerator)' "
Expand Down