Skip to content

Commit c9401d5

Browse files
committed
[Observability] Add alarm on missing clustermgtd heartbeat.
1 parent 6a0cb21 commit c9401d5

3 files changed

Lines changed: 85 additions & 26 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ CHANGELOG
99
- Upgrade jmespath to ~=1.0 (from ~=0.10).
1010
- Upgrade tabulate to <=0.9.0 (from <=0.8.10).
1111
- Add permission `cloudwatch:PutMetricData` to the head node policy so that clustermgtd is able to emit metrics.
12+
- Add alarm on missing clustermgtd heartbeat.
1213

1314
**BUG FIXES**
1415
- Add validation to block updates that change tag order. Blocking such change prevents update failures.

cli/src/pcluster/templates/cluster_stack.py

Lines changed: 67 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@
6464
CW_ALARM_PERIOD_DEFAULT,
6565
CW_LOG_GROUP_NAME_PREFIX,
6666
CW_LOGS_CFN_PARAM_NAME,
67+
CW_METRICS_CLUSTERMGTD_HEARTBEAT,
68+
CW_METRICS_DIMENSION_CLUSTER_NAME,
69+
CW_METRICS_NAMESPACE,
6770
DEFAULT_EPHEMERAL_DIR,
6871
EFS_PORT,
6972
FSX_PORTS,
@@ -365,37 +368,77 @@ def _add_head_node_alarms(self):
365368
self.head_node_alarms = []
366369

367370
metrics_for_alarms = {
368-
"Health": self._cw_metric_head_node("AWS/EC2", "StatusCheckFailed"),
369-
"Cpu": self._cw_metric_head_node("AWS/EC2", "CPUUtilization"),
370-
"Mem": self._cw_metric_head_node("CWAgent", "mem_used_percent"),
371-
"Disk": self._cw_metric_head_node("CWAgent", "disk_used_percent", extra_dimensions={"path": "/"}),
371+
"Health": {
372+
"metric": self._cw_metric_head_node("AWS/EC2", "StatusCheckFailed"),
373+
"evaluation_periods": CW_ALARM_EVALUATION_PERIODS_DEFAULT,
374+
"datapoints_to_alarm": CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT,
375+
"comparison_operator": cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
376+
"threshold": 0,
377+
"treat_missing_data": cloudwatch.TreatMissingData.IGNORE,
378+
},
379+
"Cpu": {
380+
"metric": self._cw_metric_head_node("AWS/EC2", "CPUUtilization"),
381+
"evaluation_periods": CW_ALARM_EVALUATION_PERIODS_DEFAULT,
382+
"datapoints_to_alarm": CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT,
383+
"comparison_operator": cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
384+
"threshold": CW_ALARM_PERCENT_THRESHOLD_DEFAULT,
385+
"treat_missing_data": cloudwatch.TreatMissingData.IGNORE,
386+
},
387+
"Mem": {
388+
"metric": self._cw_metric_head_node("CWAgent", "mem_used_percent"),
389+
"evaluation_periods": CW_ALARM_EVALUATION_PERIODS_DEFAULT,
390+
"datapoints_to_alarm": CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT,
391+
"comparison_operator": cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
392+
"threshold": CW_ALARM_PERCENT_THRESHOLD_DEFAULT,
393+
"treat_missing_data": cloudwatch.TreatMissingData.IGNORE,
394+
},
395+
"Disk": {
396+
"metric": self._cw_metric_head_node("CWAgent", "disk_used_percent", extra_dimensions={"path": "/"}),
397+
"evaluation_periods": CW_ALARM_EVALUATION_PERIODS_DEFAULT,
398+
"datapoints_to_alarm": CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT,
399+
"comparison_operator": cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
400+
"threshold": CW_ALARM_PERCENT_THRESHOLD_DEFAULT,
401+
"treat_missing_data": cloudwatch.TreatMissingData.IGNORE,
402+
},
403+
"Clustermgtd": {
404+
"metric": self._cw_metric_head_node(
405+
CW_METRICS_NAMESPACE,
406+
CW_METRICS_CLUSTERMGTD_HEARTBEAT,
407+
extra_dimensions={CW_METRICS_DIMENSION_CLUSTER_NAME: self.config.cluster_name},
408+
),
409+
"evaluation_periods": 10,
410+
"datapoints_to_alarm": 10,
411+
"comparison_operator": cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD,
412+
"threshold": 1,
413+
"treat_missing_data": cloudwatch.TreatMissingData.BREACHING,
414+
},
372415
}
373416

374-
for metric_key, metric in metrics_for_alarms.items():
417+
for metric_key, alarm_details in metrics_for_alarms.items():
375418
alarm_id = f"HeadNode{metric_key}Alarm"
376419
alarm_name = f"{self.stack.stack_name}-HeadNode-{metric_key}"
377-
threshold = 0 if metric_key == "Health" else CW_ALARM_PERCENT_THRESHOLD_DEFAULT
378-
self.head_node_alarms.append(
379-
cloudwatch.Alarm(
380-
scope=self.stack,
381-
id=alarm_id,
382-
alarm_name=alarm_name,
383-
metric=metric,
384-
evaluation_periods=CW_ALARM_EVALUATION_PERIODS_DEFAULT,
385-
threshold=threshold,
386-
comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
387-
datapoints_to_alarm=CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT,
388-
)
389-
)
390-
391-
self.head_node_alarms.append(
392-
cloudwatch.CompositeAlarm(
420+
alarm = cloudwatch.Alarm(
393421
scope=self.stack,
394-
id="HeadNodeAlarm",
395-
composite_alarm_name=f"{self.stack.stack_name}-HeadNode",
396-
alarm_rule=cloudwatch.AlarmRule.any_of(*self.head_node_alarms),
422+
id=alarm_id,
423+
alarm_name=alarm_name,
424+
metric=alarm_details["metric"],
425+
evaluation_periods=alarm_details["evaluation_periods"],
426+
threshold=alarm_details["threshold"],
427+
comparison_operator=alarm_details["comparison_operator"],
428+
datapoints_to_alarm=alarm_details["datapoints_to_alarm"],
429+
treat_missing_data=alarm_details["treat_missing_data"],
397430
)
431+
alarm.node.add_dependency(self.wait_condition)
432+
self.head_node_alarms.append(alarm)
433+
434+
composite_alarm = cloudwatch.CompositeAlarm(
435+
scope=self.stack,
436+
id="HeadNodeAlarm",
437+
composite_alarm_name=f"{self.stack.stack_name}-HeadNode",
438+
alarm_rule=cloudwatch.AlarmRule.any_of(*self.head_node_alarms),
398439
)
440+
composite_alarm.node.add_dependency(self.wait_condition)
441+
self.head_node_alarms.append(composite_alarm)
399442

400443
def _add_iam_resources(self):
401444
head_node_iam_resources = HeadNodeIamResources(

cli/src/pcluster/templates/cw_dashboard_builder.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,12 @@
1919

2020
from pcluster.config.cluster_config import BaseClusterConfig, ExistingFileCache, SharedFsxLustre
2121
from pcluster.config.common import SharedStorageType
22-
from pcluster.constants import Feature
22+
from pcluster.constants import (
23+
CW_METRICS_CLUSTERMGTD_HEARTBEAT,
24+
CW_METRICS_DIMENSION_CLUSTER_NAME,
25+
CW_METRICS_NAMESPACE,
26+
Feature,
27+
)
2328
from pcluster.utils import is_feature_supported
2429

2530
MAX_WIDTH = 24
@@ -567,9 +572,19 @@ def _add_head_node_instance_metrics_graphs(self):
567572
new_pcluster_metric(title="Memory Used Percent", metrics=["mem_used_percent"], namespace="CWAgent"),
568573
]
569574

575+
# Custom Metrics
576+
pcluster_metrics = [
577+
new_pcluster_metric(
578+
title="Daemons Heartbeats",
579+
metrics=[CW_METRICS_CLUSTERMGTD_HEARTBEAT],
580+
namespace=CW_METRICS_NAMESPACE,
581+
additional_dimensions={CW_METRICS_DIMENSION_CLUSTER_NAME: self.config.cluster_name},
582+
),
583+
]
584+
570585
# Create graphs for EC2 metrics, CW Agent metrics and custom ParallelCluster metrics, and update coordinates
571586
widgets_list = []
572-
for metrics_param in ec2_metrics + cwagent_metrics:
587+
for metrics_param in ec2_metrics + cwagent_metrics + pcluster_metrics:
573588
metrics_list = self._generate_metrics_list(metrics_param)
574589
graph_widget = self._generate_graph_widget(metrics_param.title, metrics_list)
575590
widgets_list.append(graph_widget)

0 commit comments

Comments
 (0)