Skip to content

Commit 91d71d1

Browse files
committed
[Observability] Add alarm on missing clustermgtd heartbeat.
1 parent 4df799d commit 91d71d1

5 files changed

Lines changed: 88 additions & 30 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ CHANGELOG
99
- Upgrade jmespath to ~=1.0 (from ~=0.10).
1010
- Upgrade tabulate to <=0.9.0 (from <=0.8.10).
1111
- Add permission `cloudwatch:PutMetricData` to the head node policy so that clustermgtd is able to emit metrics.
12+
- Add alarm on missing clustermgtd heartbeat.
1213

1314
**BUG FIXES**
1415
- Add validation to block updates that change tag order. Blocking such change prevents update failures.

cli/src/pcluster/templates/cluster_stack.py

Lines changed: 51 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@
6464
CW_ALARM_PERIOD_DEFAULT,
6565
CW_LOG_GROUP_NAME_PREFIX,
6666
CW_LOGS_CFN_PARAM_NAME,
67+
CW_METRICS_CLUSTERMGTD_HEARTBEAT,
68+
CW_METRICS_DIMENSION_CLUSTER_NAME,
69+
CW_METRICS_NAMESPACE,
6770
DEFAULT_EPHEMERAL_DIR,
6871
EFS_PORT,
6972
FSX_PORTS,
@@ -364,38 +367,62 @@ def _cw_metric_head_node(
364367
def _add_head_node_alarms(self):
365368
self.head_node_alarms = []
366369

370+
# Metric-specific configurations (only specify overrides from defaults)
367371
metrics_for_alarms = {
368-
"Health": self._cw_metric_head_node("AWS/EC2", "StatusCheckFailed"),
369-
"Cpu": self._cw_metric_head_node("AWS/EC2", "CPUUtilization"),
370-
"Mem": self._cw_metric_head_node("CWAgent", "mem_used_percent"),
371-
"Disk": self._cw_metric_head_node("CWAgent", "disk_used_percent", extra_dimensions={"path": "/"}),
372+
"Health": {
373+
"metric": self._cw_metric_head_node("AWS/EC2", "StatusCheckFailed"),
374+
"threshold": 0,
375+
},
376+
"Cpu": {
377+
"metric": self._cw_metric_head_node("AWS/EC2", "CPUUtilization"),
378+
},
379+
"Mem": {
380+
"metric": self._cw_metric_head_node("CWAgent", "mem_used_percent"),
381+
},
382+
"Disk": {
383+
"metric": self._cw_metric_head_node("CWAgent", "disk_used_percent", extra_dimensions={"path": "/"}),
384+
},
385+
"ClustermgtdHeartbeat": {
386+
"metric": self._cw_metric_head_node(
387+
CW_METRICS_NAMESPACE,
388+
CW_METRICS_CLUSTERMGTD_HEARTBEAT,
389+
extra_dimensions={CW_METRICS_DIMENSION_CLUSTER_NAME: self.config.cluster_name},
390+
),
391+
"evaluation_periods": 10,
392+
"datapoints_to_alarm": 10,
393+
"comparison_operator": cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD,
394+
"threshold": 1,
395+
"treat_missing_data": cloudwatch.TreatMissingData.BREACHING,
396+
},
372397
}
373398

374-
for metric_key, metric in metrics_for_alarms.items():
399+
for metric_key, alarm_config in metrics_for_alarms.items():
375400
alarm_id = f"HeadNode{metric_key}Alarm"
376401
alarm_name = f"{self.stack.stack_name}-HeadNode-{metric_key}"
377-
threshold = 0 if metric_key == "Health" else CW_ALARM_PERCENT_THRESHOLD_DEFAULT
378-
self.head_node_alarms.append(
379-
cloudwatch.Alarm(
380-
scope=self.stack,
381-
id=alarm_id,
382-
alarm_name=alarm_name,
383-
metric=metric,
384-
evaluation_periods=CW_ALARM_EVALUATION_PERIODS_DEFAULT,
385-
threshold=threshold,
386-
comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
387-
datapoints_to_alarm=CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT,
388-
)
389-
)
390-
391-
self.head_node_alarms.append(
392-
cloudwatch.CompositeAlarm(
402+
alarm = cloudwatch.Alarm(
393403
scope=self.stack,
394-
id="HeadNodeAlarm",
395-
composite_alarm_name=f"{self.stack.stack_name}-HeadNode",
396-
alarm_rule=cloudwatch.AlarmRule.any_of(*self.head_node_alarms),
404+
id=alarm_id,
405+
alarm_name=alarm_name,
406+
metric=alarm_config["metric"],
407+
evaluation_periods=alarm_config.get("evaluation_periods", CW_ALARM_EVALUATION_PERIODS_DEFAULT),
408+
threshold=alarm_config.get("threshold", CW_ALARM_PERCENT_THRESHOLD_DEFAULT),
409+
comparison_operator=alarm_config.get(
410+
"comparison_operator", cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD
411+
),
412+
datapoints_to_alarm=alarm_config.get("datapoints_to_alarm", CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT),
413+
treat_missing_data=alarm_config.get("treat_missing_data", cloudwatch.TreatMissingData.MISSING),
397414
)
415+
alarm.node.add_dependency(self.wait_condition)
416+
self.head_node_alarms.append(alarm)
417+
418+
composite_alarm = cloudwatch.CompositeAlarm(
419+
scope=self.stack,
420+
id="HeadNodeAlarm",
421+
composite_alarm_name=f"{self.stack.stack_name}-HeadNode",
422+
alarm_rule=cloudwatch.AlarmRule.any_of(*self.head_node_alarms),
398423
)
424+
composite_alarm.node.add_dependency(self.wait_condition)
425+
self.head_node_alarms.append(composite_alarm)
399426

400427
def _add_iam_resources(self):
401428
head_node_iam_resources = HeadNodeIamResources(

cli/src/pcluster/templates/cw_dashboard_builder.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,12 @@
1919

2020
from pcluster.config.cluster_config import BaseClusterConfig, ExistingFileCache, SharedFsxLustre
2121
from pcluster.config.common import SharedStorageType
22-
from pcluster.constants import Feature
22+
from pcluster.constants import (
23+
CW_METRICS_CLUSTERMGTD_HEARTBEAT,
24+
CW_METRICS_DIMENSION_CLUSTER_NAME,
25+
CW_METRICS_NAMESPACE,
26+
Feature,
27+
)
2328
from pcluster.utils import is_feature_supported
2429

2530
MAX_WIDTH = 24
@@ -567,9 +572,19 @@ def _add_head_node_instance_metrics_graphs(self):
567572
new_pcluster_metric(title="Memory Used Percent", metrics=["mem_used_percent"], namespace="CWAgent"),
568573
]
569574

575+
# Custom Metrics
576+
pcluster_metrics = [
577+
new_pcluster_metric(
578+
title="Daemons Heartbeats",
579+
metrics=[CW_METRICS_CLUSTERMGTD_HEARTBEAT],
580+
namespace=CW_METRICS_NAMESPACE,
581+
additional_dimensions={CW_METRICS_DIMENSION_CLUSTER_NAME: self.config.cluster_name},
582+
),
583+
]
584+
570585
# Create graphs for EC2 metrics, CW Agent metrics and ParallelCluster custom metrics, and update coordinates
571586
widgets_list = []
572-
for metrics_param in ec2_metrics + cwagent_metrics:
587+
for metrics_param in ec2_metrics + cwagent_metrics + pcluster_metrics:
573588
metrics_list = self._generate_metrics_list(metrics_param)
574589
graph_widget = self._generate_graph_widget(metrics_param.title, metrics_list)
575590
widgets_list.append(graph_widget)

cli/tests/pcluster/templates/test_cluster_stack.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ def test_add_alarms(mocker, config_file_name):
281281
"comparison_operator": "GreaterThanThreshold",
282282
"evaluation_periods": 1,
283283
"datapoints_to_alarm": 1,
284-
"treat_missing_data": None,
284+
"treat_missing_data": "missing",
285285
},
286286
"Cpu": {
287287
"name": "clustername-HeadNode-Cpu",
@@ -291,7 +291,7 @@ def test_add_alarms(mocker, config_file_name):
291291
"comparison_operator": "GreaterThanThreshold",
292292
"evaluation_periods": 1,
293293
"datapoints_to_alarm": 1,
294-
"treat_missing_data": None,
294+
"treat_missing_data": "missing",
295295
},
296296
"Mem": {
297297
"name": "clustername-HeadNode-Mem",
@@ -301,7 +301,7 @@ def test_add_alarms(mocker, config_file_name):
301301
"comparison_operator": "GreaterThanThreshold",
302302
"evaluation_periods": 1,
303303
"datapoints_to_alarm": 1,
304-
"treat_missing_data": None,
304+
"treat_missing_data": "missing",
305305
},
306306
"Disk": {
307307
"name": "clustername-HeadNode-Disk",
@@ -311,7 +311,17 @@ def test_add_alarms(mocker, config_file_name):
311311
"comparison_operator": "GreaterThanThreshold",
312312
"evaluation_periods": 1,
313313
"datapoints_to_alarm": 1,
314-
"treat_missing_data": None,
314+
"treat_missing_data": "missing",
315+
},
316+
"Clustermgtd-Heartbeat": {
317+
"name": "clustername-HeadNode-ClustermgtdHeartbeat",
318+
"metric_name": "ClustermgtdHeartbeat",
319+
"namespace": "ParallelCluster",
320+
"threshold": 1,
321+
"comparison_operator": "LessThanThreshold",
322+
"evaluation_periods": 10,
323+
"datapoints_to_alarm": 10,
324+
"treat_missing_data": "breaching",
315325
},
316326
}
317327

cli/tests/pcluster/templates/test_cw_dashboard_builder.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,9 @@ def _verify_alarms(output_yaml, alarms_enabled):
101101
assert_that(output_yaml).contains("HeadNodeDiskAlarm")
102102
assert_that(output_yaml).contains("disk_used_percent")
103103

104+
assert_that(output_yaml).contains("HeadNodeClustermgtdHeartbeatAlarm")
105+
assert_that(output_yaml).contains("ClustermgtdHeartbeat")
106+
104107
else:
105108
assert_that(output_yaml).does_not_contain("Cluster Alarms")
106109
assert_that(output_yaml).does_not_contain("AWS::CloudWatch::Alarm")
@@ -139,6 +142,8 @@ def _verify_head_node_instance_metrics_graphs(output_yaml):
139142
assert_that(output_yaml).contains("Disk Read/Write Ops")
140143
assert_that(output_yaml).contains("Disk Used Percent")
141144
assert_that(output_yaml).contains("Memory Used Percent")
145+
assert_that(output_yaml).contains("Daemons Heartbeats")
146+
assert_that(output_yaml).contains("ClustermgtdHeartbeat")
142147

143148

144149
def _verify_ec2_metrics_conditions(cluster_config, output_yaml):

0 commit comments

Comments
 (0)