Skip to content

Commit 617eda4

Browse files
launch template override
1 parent 28f3ef1 commit 617eda4

8 files changed

Lines changed: 195 additions & 0 deletions

File tree

cli/src/pcluster/aws/ec2.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,22 @@ def describe_image(self, ami_id):
162162
return ImageInfo(images[0])
163163
raise AWSClientError(function_name="describe_images", message=f"Image {ami_id} not found")
164164

165+
@AWSExceptionHandler.handle_client_exception
@Cache.cached
def describe_launch_template_version(self, launch_template_id, version):
    """
    Return the LaunchTemplateData of a single launch template version.

    :param launch_template_id: id of the launch template to look up
    :param version: version to retrieve; it is converted to a string before being sent to EC2
    :return: the "LaunchTemplateData" dict of the first version returned, or an empty dict if that key is absent
    :raises AWSClientError: if the response contains no launch template versions
    """
    matching_versions = self._client.describe_launch_template_versions(
        LaunchTemplateId=launch_template_id,
        Versions=[str(version)],
    ).get("LaunchTemplateVersions", [])

    # Exactly one version is requested, so the first entry is the one we asked for.
    if matching_versions:
        return matching_versions[0].get("LaunchTemplateData", {})

    raise AWSClientError(
        function_name="describe_launch_template_versions",
        message=f"Launch template {launch_template_id} version {version} not found",
    )
180+
165181
@AWSExceptionHandler.handle_client_exception
166182
@Cache.cached
167183
def describe_images(self, ami_ids, filters, owners):

cli/src/pcluster/config/cluster_config.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@
158158
InstanceTypePlacementGroupValidator,
159159
InstanceTypeValidator,
160160
KeyPairValidator,
161+
LaunchTemplateOverridesValidator,
161162
PlacementGroupCapacityReservationValidator,
162163
PlacementGroupCapacityTypeValidator,
163164
PlacementGroupNamingValidator,
@@ -1631,6 +1632,7 @@ def __init__(
16311632
self.managed_head_node_security_group = None
16321633
self.managed_compute_security_group = None
16331634
self.instance_types_data_version = ""
1635+
self.run_instances_overrides_version = ""
16341636

16351637
def _register_validators(self, context: ValidatorContext = None): # noqa: D102 #pylint: disable=unused-argument
16361638
self._register_validator(RegionValidator, region=self.region)
@@ -2222,6 +2224,15 @@ def scheduler_resources(self):
22222224
return str(files(__package__).parent / "resources" / "batch")
22232225

22242226

2227+
class LaunchTemplateOverrides(Resource):
    """
    Hold the LaunchTemplateOverrides settings of a compute resource.

    The section identifies one version of an EC2 launch template through its id and version.
    """

    def __init__(self, launch_template_id: str = None, version: int = None, **kwargs):
        super().__init__(**kwargs)
        # Both values go through Resource.init_param, with no default supplied.
        self.launch_template_id = Resource.init_param(launch_template_id)
        self.version = Resource.init_param(version)
2234+
2235+
22252236
class _BaseSlurmComputeResource(BaseComputeResource):
22262237
"""Represent the Slurm Compute Resource."""
22272238

@@ -2240,6 +2251,7 @@ def __init__(
22402251
tags: List[Tag] = None,
22412252
static_node_priority: int = None,
22422253
dynamic_node_priority: int = None,
2254+
launch_template_overrides=None,
22432255
**kwargs,
22442256
):
22452257
super().__init__(**kwargs)
@@ -2260,6 +2272,7 @@ def __init__(
22602272
self.tags = tags
22612273
self.static_node_priority = Resource.init_param(static_node_priority, default=1)
22622274
self.dynamic_node_priority = Resource.init_param(dynamic_node_priority, default=1000)
2275+
self.launch_template_overrides = launch_template_overrides
22632276

22642277
@abstractmethod
22652278
def is_flexible(self) -> bool:
@@ -2362,6 +2375,15 @@ def _register_validators(self, context: ValidatorContext = None):
23622375
ec2memory=min_memory,
23632376
instance_type=smallest_type,
23642377
)
2378+
if self.launch_template_overrides:
2379+
self._register_validator(
2380+
LaunchTemplateOverridesValidator,
2381+
launch_template_id=self.launch_template_overrides.launch_template_id,
2382+
version=self.launch_template_overrides.version,
2383+
instance_types=self.instance_types,
2384+
max_network_cards=self.max_network_cards,
2385+
is_flexible=self.is_flexible(),
2386+
)
23652387

23662388
def is_flexible(self):
23672389
"""Return True because the ComputeResource can contain multiple instance types."""
@@ -2449,6 +2471,15 @@ def _register_validators(self, context: ValidatorContext = None):
24492471
ec2memory=self._instance_type_info.ec2memory_size_in_mib(),
24502472
instance_type=self.instance_type,
24512473
)
2474+
if self.launch_template_overrides:
2475+
self._register_validator(
2476+
LaunchTemplateOverridesValidator,
2477+
launch_template_id=self.launch_template_overrides.launch_template_id,
2478+
version=self.launch_template_overrides.version,
2479+
instance_types=self.instance_types,
2480+
max_network_cards=self.max_network_cards,
2481+
is_flexible=self.is_flexible(),
2482+
)
24522483

24532484
@property
24542485
def architecture(self) -> str:
@@ -2975,6 +3006,40 @@ def get_instance_types_data(self):
29753006
result[instance_type] = instance_type_info.instance_type_data
29763007
return result
29773008

3009+
def get_run_instances_overrides(self):
    """
    Collect the launch template data of every compute resource that configures LaunchTemplateOverrides.

    Every queue and compute resource of the scheduling section is visited. Compute resources without a
    launch_template_overrides attribute, or with a falsy one, are skipped. For the others the referenced
    launch template version is fetched through the EC2 client.

    :return: a dict shaped as {queue_name: {compute_resource_name: launch_template_data}};
             entries with empty launch template data are omitted, so the dict is empty when nothing is configured.
    """
    overrides_by_queue = {}
    for queue in self.scheduling.queues:
        for compute_resource in queue.compute_resources:
            # Not every compute resource type declares the attribute, hence getattr with a default.
            lt_overrides = getattr(compute_resource, "launch_template_overrides", None)
            if not lt_overrides:
                continue

            template_id = lt_overrides.launch_template_id
            template_version = lt_overrides.version
            LOGGER.info(
                "Fetching launch template %s version %s for queue %s, compute resource %s",
                template_id,
                template_version,
                queue.name,
                compute_resource.name,
            )

            template_data = AWSApi.instance().ec2.describe_launch_template_version(template_id, template_version)
            if template_data:
                overrides_by_queue.setdefault(queue.name, {})[compute_resource.name] = template_data

    return overrides_by_queue
3042+
29783043
@property
29793044
def login_nodes_ami(self):
29803045
"""Get the image id of the LoginNodes."""

cli/src/pcluster/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,7 @@
232232
"custom_artifacts_name": "artifacts.zip",
233233
"scheduler_resources_name": "scheduler_resources.zip",
234234
"change_set_name": "change-set.json",
235+
"run_instances_overrides_name": "run_instances_overrides.json",
235236
}
236237

237238
PCLUSTER_TAG_VALUE_REGEX = r"^([\w\+\-\=\.\_\:\@/]{0,256})$"

cli/src/pcluster/models/cluster.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,7 @@ def create(
374374
artifact_dir_generated = True
375375
self._upload_config()
376376
self._upload_instance_types_data()
377+
self._upload_run_instances_overrides()
377378
LOGGER.info("Generation and upload completed successfully")
378379

379380
# Create template if not provided by the user
@@ -558,6 +559,25 @@ def _upload_instance_types_data(self):
558559
e, f"Unable to upload instance types data to the S3 bucket {self.bucket.name} due to exception: {e}"
559560
)
560561

562+
def _upload_run_instances_overrides(self):
    """
    Build the run instances overrides and store them as a JSON artifact in the cluster S3 bucket.

    The VersionId returned by the upload is saved in config.run_instances_overrides_version.
    Any exception raised while building or uploading the data is converted through _cluster_error_mapper.
    """
    try:
        overrides_data = self.config.get_run_instances_overrides()
        LOGGER.info("Uploading run_instances_overrides.json to S3...")
        upload_response = self.bucket.upload_config(
            config=overrides_data,
            config_name=PCLUSTER_S3_ARTIFACTS_DICT.get("run_instances_overrides_name"),
            format=S3FileFormat.JSON,
        )
        self.config.run_instances_overrides_version = upload_response.get("VersionId")
        LOGGER.info("run_instances_overrides.json uploaded successfully.")
    except Exception as e:
        error_message = (
            f"Unable to upload run_instances_overrides.json to the S3 bucket {self.bucket.name} "
            f"due to exception: {e}"
        )
        raise _cluster_error_mapper(e, error_message)
580+
561581
def _upload_change_set(self, changes=None):
562582
"""Upload change set."""
563583
if changes:
@@ -924,6 +944,7 @@ def update(
924944
self._add_tags()
925945
self._upload_config()
926946
self._upload_instance_types_data()
947+
self._upload_run_instances_overrides()
927948
self._upload_change_set(changes)
928949

929950
# Create template if not provided by the user

cli/src/pcluster/schemas/cluster_schema.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
Image,
6565
Imds,
6666
IntelSoftware,
67+
LaunchTemplateOverrides,
6768
LocalStorage,
6869
LoginNodes,
6970
LoginNodesIam,
@@ -1536,6 +1537,25 @@ def make_resource(self, data, **kwargs):
15361537
return BaseTag(**data)
15371538

15381539

1540+
class LaunchTemplateOverridesSchema(BaseSchema):
    """
    Represent the schema of the LaunchTemplateOverrides section.

    Both fields are required and share the QUEUE_UPDATE_STRATEGY update policy.
    """

    launch_template_id = fields.Str(
        required=True,
        validate=validate.Regexp(r"^lt-[a-zA-Z0-9]+$"),
        metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY},
    )
    # Launch template versions are positive integers, so zero and negative values are rejected at load time
    # instead of surfacing later as an EC2 lookup failure.
    version = fields.Int(
        required=True,
        validate=validate.Range(min=1),
        metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY},
    )

    @post_load
    def make_resource(self, data, **kwargs):
        """Generate resource."""
        return LaunchTemplateOverrides(**data)
1557+
1558+
15391559
class SlurmComputeResourceSchema(_ComputeResourceSchema):
15401560
"""Represent the schema of the Slurm ComputeResource."""
15411561

@@ -1576,6 +1596,9 @@ class SlurmComputeResourceSchema(_ComputeResourceSchema):
15761596
validate=validate.Range(min=MIN_SLURM_NODE_PRIORITY, max=MAX_SLURM_NODE_PRIORITY),
15771597
metadata={"update_policy": UpdatePolicy.SUPPORTED},
15781598
)
1599+
launch_template_overrides = fields.Nested(
1600+
LaunchTemplateOverridesSchema, metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY}
1601+
)
15791602

15801603
@validates_schema
15811604
def no_coexist_instance_type_flexibility(self, data, **kwargs):

cli/src/pcluster/templates/cluster_stack.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1419,10 +1419,13 @@ def _add_head_node(self):
14191419
),
14201420
"cluster_config_version": self.config.config_version,
14211421
"instance_types_data_version": self.config.instance_types_data_version,
1422+
"run_instances_overrides_version": self.config.run_instances_overrides_version,
14221423
"change_set_s3_key": f"{self.bucket.artifact_directory}/configs/"
14231424
f"{PCLUSTER_S3_ARTIFACTS_DICT.get('change_set_name')}",
14241425
"instance_types_data_s3_key": f"{self.bucket.artifact_directory}/configs/"
14251426
f"{PCLUSTER_S3_ARTIFACTS_DICT.get('instance_types_data_name')}",
1427+
"run_instances_overrides_s3_key": f"{self.bucket.artifact_directory}/configs/"
1428+
f"{PCLUSTER_S3_ARTIFACTS_DICT.get('run_instances_overrides_name')}",
14261429
"custom_node_package": self.config.custom_node_package or "",
14271430
"custom_awsbatchcli_package": self.config.custom_aws_batch_cli_package or "",
14281431
"head_node_imds_secured": str(self.config.head_node.imds.secured).lower(),

cli/src/pcluster/validators/ec2_validators.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -870,3 +870,66 @@ def _validate(self, cluster_ultraserver_capacity_block_dict):
870870
f"The following capacity blocks have invalid block sizes: {'; '.join(invalid_capacity_blocks)}.",
871871
FailureLevel.ERROR,
872872
)
873+
874+
875+
class LaunchTemplateOverridesValidator(Validator):
    """
    Validate the launch template overrides configuration.

    The referenced launch template version is retrieved from EC2 and checked for:
    - deny-listed properties (WARNING);
    - more network interfaces than the supported maximum (ERROR);
    - an instance type that is not among the compute resource instance types (ERROR);
    - usage together with flexible instance types (ERROR).
    """

    # Deny-listed launch template properties that are known to break the cluster.
    _DENY_LISTED_PROPERTIES = {
        "UserData",
        "IamInstanceProfile",
        "TagSpecifications",
        "BlockDeviceMappings",
        "ImageId",
        "Monitoring",
        "InstanceInitiatedShutdownBehavior",
        "CapacityReservationSpecification",
        "InstanceMarketOptions",
        "EbsOptimized",
    }

    def _validate(self, launch_template_id, version, instance_types, max_network_cards, is_flexible):
        """
        Run the checks on the given launch template version.

        :param launch_template_id: id of the launch template referenced by the compute resource
        :param version: launch template version referenced by the compute resource
        :param instance_types: instance types configured in the compute resource
        :param max_network_cards: maximum number of network interfaces allowed for the compute resource
        :param is_flexible: True if the compute resource uses flexible instance types
        """
        try:
            # The version is passed through unchanged, as the other caller of this method does.
            # NOTE(review): the method is decorated with Cache.cached; presumably the cache is keyed on the
            # call arguments, so passing str(version) here would miss the entry created with the raw value
            # and trigger a second EC2 call - confirm against the Cache implementation.
            lt_data = AWSApi.instance().ec2.describe_launch_template_version(launch_template_id, version)
        except AWSClientError as e:
            # Without the launch template data none of the remaining checks can run.
            self._add_failure(
                f"Unable to retrieve launch template {launch_template_id} version {version}. {e}",
                FailureLevel.ERROR,
            )
            return

        # Check for deny-listed properties; sorted so the message is deterministic.
        denied_found = sorted(prop for prop in self._DENY_LISTED_PROPERTIES if prop in lt_data)
        if denied_found:
            self._add_failure(
                f"Launch template {launch_template_id} contains unsupported properties: "
                f"{', '.join(denied_found)}. Only NetworkInterfaces, InstanceType, and MetadataOptions "
                f"are supported in the override launch template.",
                FailureLevel.WARNING,
            )

        # Validate network interface count does not exceed max supported
        network_interfaces = lt_data.get("NetworkInterfaces", [])
        if network_interfaces and len(network_interfaces) > max_network_cards:
            self._add_failure(
                f"Launch template {launch_template_id} configures {len(network_interfaces)} network interfaces, "
                f"but the instance type supports a maximum of {max_network_cards}.",
                FailureLevel.ERROR,
            )

        # Validate instance type in LT matches the compute resource if specified
        lt_instance_type = lt_data.get("InstanceType")
        if lt_instance_type and lt_instance_type not in instance_types:
            self._add_failure(
                f"Instance type '{lt_instance_type}' in launch template {launch_template_id} does not match "
                f"the compute resource instance type(s): {', '.join(instance_types)}.",
                FailureLevel.ERROR,
            )

        # LaunchTemplateOverrides is rejected (ERROR, not a warning) for flexible instance types
        if is_flexible:
            self._add_failure(
                "LaunchTemplateOverrides cannot be used with flexible instance types.",
                FailureLevel.ERROR,
            )

cli/tests/pcluster/example_configs/slurm.full.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,9 @@ Scheduling:
180180
HttpProxyAddress: https://proxy-address:port
181181
ComputeResources:
182182
- Name: compute-resource-1
183+
LaunchTemplateOverrides:
184+
LaunchTemplateId: lt-0ab6123b7f1111111
185+
Version: "2"
183186
InstanceType: c4.2xlarge
184187
- Name: compute-resource-2
185188
InstanceType: c5.2xlarge

0 commit comments

Comments
 (0)