Skip to content

Commit f648a81

Browse files
committed
[Observability] Use metric filter to generate the clustermgtd heartbeat metric.
1 parent e7cb910 commit f648a81

6 files changed

Lines changed: 76 additions & 11 deletions

File tree

cli/src/pcluster/constants.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,12 @@
188188
CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT = 1
189189
DETAILED_MONITORING_ENABLED_DEFAULT = False
190190

191+
# CloudWatch Metrics
192+
CW_METRICS_NAMESPACE = "ParallelCluster"
193+
CW_METRICS_DIMENSION_CLUSTER_NAME = "ClusterName"
194+
CW_METRICS_DIMENSION_INSTANCE_ID = "InstanceId"
195+
CW_METRICS_CLUSTERMGTD_HEARTBEAT = "ClustermgtdHeartbeat"
196+
191197
STACK_EVENTS_LOG_STREAM_NAME_FORMAT = "{}-cfn-events"
192198

193199
PCLUSTER_IMAGE_NAME_REGEX = r"^[-_A-Za-z0-9{][-_A-Za-z0-9\s:{}\.]+[-_A-Za-z0-9}]$"

cli/src/pcluster/templates/cluster_stack.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
CW_LOGS_CFN_PARAM_NAME,
6767
CW_METRICS_CLUSTERMGTD_HEARTBEAT,
6868
CW_METRICS_DIMENSION_CLUSTER_NAME,
69+
CW_METRICS_DIMENSION_INSTANCE_ID,
6970
CW_METRICS_NAMESPACE,
7071
DEFAULT_EPHEMERAL_DIR,
7172
EFS_PORT,
@@ -384,8 +385,34 @@ def _add_head_node_alarms(self):
384385
},
385386
}
386387

387-
if self._condition_is_slurm():
388-
metrics_for_alarms["ClustermgtdHeartbeat"] = {
388+
# These alarms require CloudWatch logging to be enabled because they are based on CloudWatch metric filters.
389+
if self._condition_is_slurm() and self.config.is_cw_logging_enabled:
390+
# Create metric filter to extract heartbeat metric from clustermgtd event logs
391+
clustermgtd_heartbeat_metric_filter = logs.CfnMetricFilter(
392+
scope=self.stack,
393+
id=f"{CW_METRICS_CLUSTERMGTD_HEARTBEAT}Filter",
394+
filter_pattern='{ $.event-type = "clustermgtd-heartbeat" }',
395+
log_group_name=self.log_group_name,
396+
metric_transformations=[
397+
logs.CfnMetricFilter.MetricTransformationProperty(
398+
metric_namespace=CW_METRICS_NAMESPACE,
399+
metric_name=CW_METRICS_CLUSTERMGTD_HEARTBEAT,
400+
metric_value="1",
401+
unit="Count",
402+
dimensions=[
403+
logs.CfnMetricFilter.DimensionProperty(
404+
key=CW_METRICS_DIMENSION_CLUSTER_NAME, value="$.cluster-name"
405+
),
406+
logs.CfnMetricFilter.DimensionProperty(
407+
key=CW_METRICS_DIMENSION_INSTANCE_ID, value="$.instance-id"
408+
),
409+
],
410+
)
411+
],
412+
)
413+
clustermgtd_heartbeat_metric_filter.add_depends_on(self.log_group)
414+
415+
metrics_for_alarms[CW_METRICS_CLUSTERMGTD_HEARTBEAT] = {
389416
"metric": self._cw_metric_head_node(
390417
CW_METRICS_NAMESPACE,
391418
CW_METRICS_CLUSTERMGTD_HEARTBEAT,

cli/src/pcluster/templates/cw_dashboard_builder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -574,7 +574,7 @@ def _add_head_node_instance_metrics_graphs(self):
574574

575575
# Custom Metrics
576576
pcluster_metrics = []
577-
if self.config.scheduling.scheduler == "slurm":
577+
if self.config.scheduling.scheduler == "slurm" and self.config.is_cw_logging_enabled:
578578
pcluster_metrics.append(
579579
new_pcluster_metric(
580580
title="Daemons Heartbeats",
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
Image:
2+
Os: alinux2
3+
HeadNode:
4+
InstanceType: t3.micro
5+
Networking:
6+
SubnetId: subnet-12345678
7+
Ssh:
8+
KeyName: ec2-key-name
9+
Scheduling:
10+
Scheduler: slurm
11+
SlurmQueues:
12+
- Name: queue1
13+
Networking:
14+
SubnetIds:
15+
- subnet-12345678
16+
ComputeResources:
17+
- Name: compute-resource1
18+
InstanceType: c5.2xlarge
19+
Monitoring:
20+
Logs:
21+
CloudWatch:
22+
Enabled: false

cli/tests/pcluster/templates/test_cluster_stack.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,7 @@ def test_add_efs_shared_storage(mocker, test_datadir, config_file_name, expected
250250
[
251251
"slurm.required.yaml",
252252
"slurm.full.yaml",
253+
"slurm.logging_disabled.yaml",
253254
"awsbatch.simple.yaml",
254255
"awsbatch.full.yaml",
255256
],
@@ -315,7 +316,7 @@ def test_add_alarms(mocker, config_file_name):
315316
},
316317
}
317318

318-
if cluster.scheduling.scheduler == "slurm":
319+
if cluster.scheduling.scheduler == "slurm" and cluster.is_cw_logging_enabled:
319320
expected_alarms["Clustermgtd-Heartbeat"] = {
320321
"name": "clustername-HeadNode-ClustermgtdHeartbeat",
321322
"metric_name": "ClustermgtdHeartbeat",

cli/tests/pcluster/templates/test_cw_dashboard_builder.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,9 @@ def test_cw_dashboard_builder(mocker, test_datadir, set_env, config_file_name, r
5858
if cluster_config.is_cw_dashboard_enabled:
5959
assert_that(output_yaml).contains("CloudwatchDashboard")
6060
assert_that(output_yaml).contains("Head Node EC2 Metrics")
61-
_verify_head_node_instance_metrics_graphs(output_yaml, cluster_config.scheduling.scheduler)
61+
_verify_head_node_instance_metrics_graphs(
62+
output_yaml, cluster_config.scheduling.scheduler, cluster_config.is_cw_logging_enabled
63+
)
6264

6365
if cluster_config.are_alarms_enabled:
6466
assert_that(output_yaml).contains("Cluster Alarms")
@@ -79,15 +81,20 @@ def test_cw_dashboard_builder(mocker, test_datadir, set_env, config_file_name, r
7981
assert_that(output_yaml).does_not_contain("CloudwatchDashboard")
8082
assert_that(output_yaml).does_not_contain("Head Node EC2 Metrics")
8183

82-
_verify_alarms(output_yaml, cluster_config.are_alarms_enabled, cluster_config.scheduling.scheduler)
84+
_verify_alarms(
85+
output_yaml,
86+
cluster_config.are_alarms_enabled,
87+
cluster_config.scheduling.scheduler,
88+
cluster_config.is_cw_logging_enabled,
89+
)
8390

8491
if cluster_config.is_cw_logging_enabled:
8592
assert_that(output_yaml).contains("ClusterCWLogGroup")
8693
else:
8794
assert_that(output_yaml).does_not_contain("ClusterCWLogGroup")
8895

8996

90-
def _verify_alarms(output_yaml, alarms_enabled, scheduler):
97+
def _verify_alarms(output_yaml, alarms_enabled, scheduler, is_cw_logging_enabled):
9198
if alarms_enabled:
9299
assert_that(output_yaml).contains("HeadNodeHealthAlarm")
93100
assert_that(output_yaml).contains("StatusCheckFailed")
@@ -102,7 +109,7 @@ def _verify_alarms(output_yaml, alarms_enabled, scheduler):
102109
assert_that(output_yaml).contains("disk_used_percent")
103110

104111
# ClustermgtdHeartbeat alarm is only created for Slurm scheduler with CloudWatch logging enabled
105-
if scheduler == "slurm":
112+
if scheduler == "slurm" and is_cw_logging_enabled:
106113
assert_that(output_yaml).contains("HeadNodeClustermgtdHeartbeatAlarm")
107114
assert_that(output_yaml).contains("ClustermgtdHeartbeat")
108115
else:
@@ -130,13 +137,15 @@ def _verify_metric_filter_dimensions(metric_filters):
130137
)
131138

132139
expected_dimensions = [{"Key": "ClusterName", "Value": "$.cluster-name"}]
140+
if name == "ClustermgtdHeartbeatFilter":
141+
expected_dimensions.append({"Key": "InstanceId", "Value": "$.instance-id"})
133142

134143
assert_that(dimensions, description=f"{name} should have dimensions {expected_dimensions}").is_equal_to(
135144
expected_dimensions
136145
)
137146

138147

139-
def _verify_head_node_instance_metrics_graphs(output_yaml, scheduler):
148+
def _verify_head_node_instance_metrics_graphs(output_yaml, scheduler, is_cw_logging_enabled):
140149
"""Verify CloudWatch graphs within the Head Node Instance Metrics section."""
141150
assert_that(output_yaml).contains("Head Node Instance Metrics")
142151
assert_that(output_yaml).contains("CPU Utilization")
@@ -146,8 +155,8 @@ def _verify_head_node_instance_metrics_graphs(output_yaml, scheduler):
146155
assert_that(output_yaml).contains("Disk Read/Write Ops")
147156
assert_that(output_yaml).contains("Disk Used Percent")
148157
assert_that(output_yaml).contains("Memory Used Percent")
149-
# Daemons Heartbeats widget is only created for Slurm scheduler
150-
if scheduler == "slurm":
158+
# Daemons Heartbeats widget is only created for Slurm scheduler with logging enabled
159+
if scheduler == "slurm" and is_cw_logging_enabled:
151160
assert_that(output_yaml).contains("Daemons Heartbeats")
152161
assert_that(output_yaml).contains("ClustermgtdHeartbeat")
153162
else:

0 commit comments

Comments
 (0)