Skip to content

Commit 58e8ded

Browse files
Add LaunchTemplateOverrides support for compute resources
Allow customers to specify LaunchTemplateOverrides with a LaunchTemplateId and Version at the compute resource level. This enables customers to customize RunInstances parameters (e.g. NetworkInterfaces) that are not directly exposed by ParallelCluster config, without requiring changes to the ParallelCluster codebase for each new parameter. The launch template data is fetched, transformed into run_instances_overrides.json (keyed by queue/compute resource name), and uploaded to the cluster S3 bucket. The file is always uploaded (empty {} when no overrides) to cleanly handle cluster update transitions.
1 parent 9c208e4 commit 58e8ded

8 files changed

Lines changed: 182 additions & 0 deletions

File tree

cli/src/pcluster/aws/ec2.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,22 @@ def describe_image(self, ami_id):
162162
return ImageInfo(images[0])
163163
raise AWSClientError(function_name="describe_images", message=f"Image {ami_id} not found")
164164

165+
@AWSExceptionHandler.handle_client_exception
@Cache.cached
def describe_launch_template_version(self, launch_template_id, version):
    """
    Fetch the LaunchTemplateData of a single launch template version.

    :param launch_template_id: id of the launch template to describe
    :param version: version to describe; it is converted to a string before being sent to EC2
    :return: the "LaunchTemplateData" dict of the first version returned, or {} if that key is absent
    :raises AWSClientError: if the response contains no launch template versions
    """
    request = {"LaunchTemplateId": launch_template_id, "Versions": [str(version)]}
    found_versions = self._client.describe_launch_template_versions(**request).get("LaunchTemplateVersions", [])
    if found_versions:
        # Exactly one version was requested, so only the first entry is relevant.
        return found_versions[0].get("LaunchTemplateData", {})
    raise AWSClientError(
        function_name="describe_launch_template_versions",
        message=f"Launch template {launch_template_id} version {version} not found",
    )
180+
165181
@AWSExceptionHandler.handle_client_exception
166182
@Cache.cached
167183
def describe_images(self, ami_ids, filters, owners):

cli/src/pcluster/config/cluster_config.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@
158158
InstanceTypePlacementGroupValidator,
159159
InstanceTypeValidator,
160160
KeyPairValidator,
161+
LaunchTemplateOverridesValidator,
161162
PlacementGroupCapacityReservationValidator,
162163
PlacementGroupCapacityTypeValidator,
163164
PlacementGroupNamingValidator,
@@ -1631,6 +1632,7 @@ def __init__(
16311632
self.managed_head_node_security_group = None
16321633
self.managed_compute_security_group = None
16331634
self.instance_types_data_version = ""
1635+
self.run_instances_overrides_version = ""
16341636

16351637
def _register_validators(self, context: ValidatorContext = None): # noqa: D102 #pylint: disable=unused-argument
16361638
self._register_validator(RegionValidator, region=self.region)
@@ -2222,6 +2224,15 @@ def scheduler_resources(self):
22222224
return str(files(__package__).parent / "resources" / "batch")
22232225

22242226

2227+
class LaunchTemplateOverrides(Resource):
    """
    Hold the LaunchTemplateOverrides settings of a compute resource.

    The section carries two values: the id of the launch template and the version to use.
    """

    def __init__(self, launch_template_id: str = None, version: int = None, **kwargs):
        super().__init__(**kwargs)
        # Wrap both values with Resource.init_param, as the other resource attributes in this module do.
        template_id_param = Resource.init_param(launch_template_id)
        version_param = Resource.init_param(version)
        self.launch_template_id = template_id_param
        self.version = version_param
2234+
2235+
22252236
class _BaseSlurmComputeResource(BaseComputeResource):
22262237
"""Represent the Slurm Compute Resource."""
22272238

@@ -2240,6 +2251,7 @@ def __init__(
22402251
tags: List[Tag] = None,
22412252
static_node_priority: int = None,
22422253
dynamic_node_priority: int = None,
2254+
launch_template_overrides=None,
22432255
**kwargs,
22442256
):
22452257
super().__init__(**kwargs)
@@ -2260,6 +2272,7 @@ def __init__(
22602272
self.tags = tags
22612273
self.static_node_priority = Resource.init_param(static_node_priority, default=1)
22622274
self.dynamic_node_priority = Resource.init_param(dynamic_node_priority, default=1000)
2275+
self.launch_template_overrides = launch_template_overrides
22632276

22642277
@abstractmethod
22652278
def is_flexible(self) -> bool:
@@ -2362,6 +2375,15 @@ def _register_validators(self, context: ValidatorContext = None):
23622375
ec2memory=min_memory,
23632376
instance_type=smallest_type,
23642377
)
2378+
if self.launch_template_overrides:
2379+
self._register_validator(
2380+
LaunchTemplateOverridesValidator,
2381+
launch_template_id=self.launch_template_overrides.launch_template_id,
2382+
version=self.launch_template_overrides.version,
2383+
instance_types=self.instance_types,
2384+
max_network_cards=self.max_network_cards,
2385+
is_flexible=self.is_flexible(),
2386+
)
23652387

23662388
def is_flexible(self):
23672389
"""Return True because the ComputeResource can contain multiple instance types."""
@@ -2449,6 +2471,15 @@ def _register_validators(self, context: ValidatorContext = None):
24492471
ec2memory=self._instance_type_info.ec2memory_size_in_mib(),
24502472
instance_type=self.instance_type,
24512473
)
2474+
if self.launch_template_overrides:
2475+
self._register_validator(
2476+
LaunchTemplateOverridesValidator,
2477+
launch_template_id=self.launch_template_overrides.launch_template_id,
2478+
version=self.launch_template_overrides.version,
2479+
instance_types=self.instance_types,
2480+
max_network_cards=self.max_network_cards,
2481+
is_flexible=self.is_flexible(),
2482+
)
24522483

24532484
@property
24542485
def architecture(self) -> str:
@@ -2975,6 +3006,40 @@ def get_instance_types_data(self):
29753006
result[instance_type] = instance_type_info.instance_type_data
29763007
return result
29773008

3009+
def get_run_instances_overrides(self):
    """
    Collect the launch template data of every compute resource using LaunchTemplateOverrides.

    Walks all queues and their compute resources; compute resources without
    launch_template_overrides are skipped. For the others, the referenced launch template
    version is described through the EC2 client and, when the returned data is not empty,
    stored in the result.

    :return: dict shaped as {queue_name: {compute_resource_name: launch_template_data}};
             an empty dict when nothing is configured.
    """
    # Lazy generator: queues and compute resources are still visited one at a time, in order.
    configured_pairs = (
        (queue, compute_resource)
        for queue in self.scheduling.queues
        for compute_resource in queue.compute_resources
        if getattr(compute_resource, "launch_template_overrides", None)
    )

    result = {}
    for queue, compute_resource in configured_pairs:
        template_ref = compute_resource.launch_template_overrides
        template_id, template_version = template_ref.launch_template_id, template_ref.version

        LOGGER.info(
            "Fetching launch template %s version %s for queue %s, compute resource %s",
            template_id,
            template_version,
            queue.name,
            compute_resource.name,
        )
        template_data = AWSApi.instance().ec2.describe_launch_template_version(template_id, template_version)

        # Empty launch template data contributes nothing, so no entry is created for it.
        if not template_data:
            continue
        result.setdefault(queue.name, {})[compute_resource.name] = template_data

    return result
3042+
29783043
@property
29793044
def login_nodes_ami(self):
29803045
"""Get the image id of the LoginNodes."""

cli/src/pcluster/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,7 @@
232232
"custom_artifacts_name": "artifacts.zip",
233233
"scheduler_resources_name": "scheduler_resources.zip",
234234
"change_set_name": "change-set.json",
235+
"run_instances_overrides_name": "run_instances_overrides.json",
235236
}
236237

237238
PCLUSTER_TAG_VALUE_REGEX = r"^([\w\+\-\=\.\_\:\@/]{0,256})$"

cli/src/pcluster/models/cluster.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,7 @@ def create(
374374
artifact_dir_generated = True
375375
self._upload_config()
376376
self._upload_instance_types_data()
377+
self._upload_run_instances_overrides()
377378
LOGGER.info("Generation and upload completed successfully")
378379

379380
# Create template if not provided by the user
@@ -558,6 +559,25 @@ def _upload_instance_types_data(self):
558559
e, f"Unable to upload instance types data to the S3 bucket {self.bucket.name} due to exception: {e}"
559560
)
560561

562+
def _upload_run_instances_overrides(self):
    """
    Build the run instances overrides and store them as JSON in the cluster S3 bucket.

    The "VersionId" returned by the upload is saved on the cluster config as
    run_instances_overrides_version. Any exception raised while building or uploading
    is converted through _cluster_error_mapper and re-raised.
    """
    try:
        overrides_payload = self.config.get_run_instances_overrides()
        LOGGER.info("Uploading run_instances_overrides.json to S3...")
        artifact_name = PCLUSTER_S3_ARTIFACTS_DICT.get("run_instances_overrides_name")
        upload_response = self.bucket.upload_config(
            config=overrides_payload, config_name=artifact_name, format=S3FileFormat.JSON
        )
        self.config.run_instances_overrides_version = upload_response.get("VersionId")
        LOGGER.info("run_instances_overrides.json uploaded successfully.")
    except Exception as e:
        failure_message = (
            f"Unable to upload run_instances_overrides.json to the S3 bucket {self.bucket.name} "
            f"due to exception: {e}"
        )
        raise _cluster_error_mapper(e, failure_message)
580+
561581
def _upload_change_set(self, changes=None):
562582
"""Upload change set."""
563583
if changes:
@@ -924,6 +944,7 @@ def update(
924944
self._add_tags()
925945
self._upload_config()
926946
self._upload_instance_types_data()
947+
self._upload_run_instances_overrides()
927948
self._upload_change_set(changes)
928949

929950
# Create template if not provided by the user

cli/src/pcluster/schemas/cluster_schema.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
Image,
6565
Imds,
6666
IntelSoftware,
67+
LaunchTemplateOverrides,
6768
LocalStorage,
6869
LoginNodes,
6970
LoginNodesIam,
@@ -1536,6 +1537,25 @@ def make_resource(self, data, **kwargs):
15361537
return BaseTag(**data)
15371538

15381539

1540+
class LaunchTemplateOverridesSchema(BaseSchema):
    """
    Represent the schema of the LaunchTemplateOverrides section.

    Both fields are required. The launch template id must match "lt-" followed by
    alphanumeric characters, and the version must be a positive integer.
    """

    launch_template_id = fields.Str(
        required=True,
        validate=validate.Regexp(r"^lt-[a-zA-Z0-9]+$"),
        metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY},
    )
    version = fields.Int(
        required=True,
        # Launch template version numbers start at 1; reject 0 and negative values here
        # so the user gets a schema error instead of a later EC2 API failure.
        validate=validate.Range(min=1),
        metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY},
    )

    @post_load
    def make_resource(self, data, **kwargs):
        """Generate resource."""
        return LaunchTemplateOverrides(**data)
1557+
1558+
15391559
class SlurmComputeResourceSchema(_ComputeResourceSchema):
15401560
"""Represent the schema of the Slurm ComputeResource."""
15411561

@@ -1576,6 +1596,9 @@ class SlurmComputeResourceSchema(_ComputeResourceSchema):
15761596
validate=validate.Range(min=MIN_SLURM_NODE_PRIORITY, max=MAX_SLURM_NODE_PRIORITY),
15771597
metadata={"update_policy": UpdatePolicy.SUPPORTED},
15781598
)
1599+
launch_template_overrides = fields.Nested(
1600+
LaunchTemplateOverridesSchema, metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY}
1601+
)
15791602

15801603
@validates_schema
15811604
def no_coexist_instance_type_flexibility(self, data, **kwargs):

cli/src/pcluster/templates/cluster_stack.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1419,10 +1419,13 @@ def _add_head_node(self):
14191419
),
14201420
"cluster_config_version": self.config.config_version,
14211421
"instance_types_data_version": self.config.instance_types_data_version,
1422+
"run_instances_overrides_version": self.config.run_instances_overrides_version,
14221423
"change_set_s3_key": f"{self.bucket.artifact_directory}/configs/"
14231424
f"{PCLUSTER_S3_ARTIFACTS_DICT.get('change_set_name')}",
14241425
"instance_types_data_s3_key": f"{self.bucket.artifact_directory}/configs/"
14251426
f"{PCLUSTER_S3_ARTIFACTS_DICT.get('instance_types_data_name')}",
1427+
"run_instances_overrides_s3_key": f"{self.bucket.artifact_directory}/configs/"
1428+
f"{PCLUSTER_S3_ARTIFACTS_DICT.get('run_instances_overrides_name')}",
14261429
"custom_node_package": self.config.custom_node_package or "",
14271430
"custom_awsbatchcli_package": self.config.custom_aws_batch_cli_package or "",
14281431
"head_node_imds_secured": str(self.config.head_node.imds.secured).lower(),

cli/src/pcluster/validators/ec2_validators.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -870,3 +870,53 @@ def _validate(self, cluster_ultraserver_capacity_block_dict):
870870
f"The following capacity blocks have invalid block sizes: {'; '.join(invalid_capacity_blocks)}.",
871871
FailureLevel.ERROR,
872872
)
873+
874+
875+
class LaunchTemplateOverridesValidator(Validator):
    """
    Validate the launch template overrides configuration.

    Checks performed on the referenced launch template version:
    - it can be retrieved (ERROR otherwise, and no further check is run);
    - it only contains properties from the allow list (WARNING otherwise);
    - it does not configure more network interfaces than max_network_cards (ERROR otherwise);
    - its InstanceType, when set, is one of the compute resource instance types (ERROR otherwise);
    - the compute resource is not flexible (ERROR otherwise).
    """

    # LaunchTemplateData keys accepted without a warning. The warning text below is generated
    # from this set so the message and the actual check cannot drift apart.
    _ALLOWED_PROPERTIES = frozenset({"InstanceType", "NetworkInterfaces"})

    def _validate(self, launch_template_id, version, instance_types, max_network_cards, is_flexible):
        """
        Run the launch template overrides checks.

        :param launch_template_id: id of the launch template referenced by the compute resource
        :param version: launch template version referenced by the compute resource
        :param instance_types: instance types configured for the compute resource
        :param max_network_cards: upper bound for the number of network interfaces in the template
        :param is_flexible: True when the compute resource uses flexible instance types
        """
        try:
            # The version is passed as-is: describe_launch_template_version stringifies it itself,
            # and using the same argument as the other caller presumably lets its cache be shared
            # (NOTE(review): confirm Cache.cached keys on the call arguments).
            lt_data = AWSApi.instance().ec2.describe_launch_template_version(launch_template_id, version)
        except AWSClientError as e:
            self._add_failure(
                f"Unable to retrieve launch template {launch_template_id} version {version}. {str(e)}",
                FailureLevel.ERROR,
            )
            return

        # Check for properties not in allow list
        denied_found = sorted(prop for prop in lt_data if prop not in self._ALLOWED_PROPERTIES)
        if denied_found:
            self._add_failure(
                f"Launch template {launch_template_id} contains unsupported properties: "
                f"{', '.join(denied_found)}. Supported properties in the override launch template: "
                f"{', '.join(sorted(self._ALLOWED_PROPERTIES))}.",
                FailureLevel.WARNING,
            )

        # Validate network interface count does not exceed max supported
        network_interfaces = lt_data.get("NetworkInterfaces", [])
        if network_interfaces and len(network_interfaces) > max_network_cards:
            self._add_failure(
                f"Launch template {launch_template_id} configures {len(network_interfaces)} network interfaces, "
                f"but the instance type supports a maximum of {max_network_cards}.",
                FailureLevel.ERROR,
            )

        # Validate instance type in LT matches the compute resource if specified
        lt_instance_type = lt_data.get("InstanceType")
        if lt_instance_type and lt_instance_type not in instance_types:
            self._add_failure(
                f"Instance type '{lt_instance_type}' in launch template {launch_template_id} does not match "
                f"the compute resource instance type(s): {', '.join(instance_types)}.",
                FailureLevel.ERROR,
            )

        # Reject the combination with flexible instance types (this is an error, not a warning)
        if is_flexible:
            self._add_failure(
                "LaunchTemplateOverrides cannot be used with flexible instance types.",
                FailureLevel.ERROR,
            )

cli/tests/pcluster/example_configs/slurm.full.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,9 @@ Scheduling:
180180
HttpProxyAddress: https://proxy-address:port
181181
ComputeResources:
182182
- Name: compute-resource-1
183+
LaunchTemplateOverrides:
184+
LaunchTemplateId: lt-0ab6123b7f1111111
185+
Version: "2"
183186
InstanceType: c4.2xlarge
184187
- Name: compute-resource-2
185188
InstanceType: c5.2xlarge

0 commit comments

Comments
 (0)