|
64 | 64 | CW_ALARM_PERIOD_DEFAULT, |
65 | 65 | CW_LOG_GROUP_NAME_PREFIX, |
66 | 66 | CW_LOGS_CFN_PARAM_NAME, |
| 67 | + CW_METRICS_CLUSTERMGTD_HEARTBEAT, |
| 68 | + CW_METRICS_DIMENSION_CLUSTER_NAME, |
| 69 | + CW_METRICS_NAMESPACE, |
67 | 70 | DEFAULT_EPHEMERAL_DIR, |
68 | 71 | EFS_PORT, |
69 | 72 | FSX_PORTS, |
@@ -364,38 +367,62 @@ def _cw_metric_head_node( |
def _add_head_node_alarms(self):
    """
    Create the head node CloudWatch alarms and collect them in ``self.head_node_alarms``.

    One ``cloudwatch.Alarm`` is created for every entry of ``metrics_for_alarms``, followed by a single
    ``cloudwatch.CompositeAlarm`` whose rule is "any of the alarms above". The composite alarm is appended
    last, so ``self.head_node_alarms`` ends up holding every per-metric alarm plus the composite one.

    Every entry of ``metrics_for_alarms`` must provide ``metric``. The remaining keys are optional overrides:

    - ``evaluation_periods``: defaults to ``CW_ALARM_EVALUATION_PERIODS_DEFAULT``
    - ``datapoints_to_alarm``: defaults to ``CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT``
    - ``threshold``: defaults to ``CW_ALARM_PERCENT_THRESHOLD_DEFAULT``
    - ``comparison_operator``: defaults to ``cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD``
    - ``treat_missing_data``: defaults to ``cloudwatch.TreatMissingData.MISSING``
    """
    self.head_node_alarms = []

    # Metric-specific configurations (only specify overrides from defaults)
    metrics_for_alarms = {
        "Health": {
            "metric": self._cw_metric_head_node("AWS/EC2", "StatusCheckFailed"),
            # With the default GREATER_THAN_THRESHOLD operator the alarm condition is "StatusCheckFailed > 0".
            "threshold": 0,
        },
        "Cpu": {
            "metric": self._cw_metric_head_node("AWS/EC2", "CPUUtilization"),
        },
        "Mem": {
            "metric": self._cw_metric_head_node("CWAgent", "mem_used_percent"),
        },
        "Disk": {
            "metric": self._cw_metric_head_node("CWAgent", "disk_used_percent", extra_dimensions={"path": "/"}),
        },
        "ClustermgtdHeartbeat": {
            "metric": self._cw_metric_head_node(
                CW_METRICS_NAMESPACE,
                CW_METRICS_CLUSTERMGTD_HEARTBEAT,
                extra_dimensions={CW_METRICS_DIMENSION_CLUSTER_NAME: self.config.cluster_name},
            ),
            # Inverted check compared to the other alarms: the condition is "heartbeat < 1" on 10 out of
            # 10 evaluation periods, and a missing datapoint is counted as breaching.
            "evaluation_periods": 10,
            "datapoints_to_alarm": 10,
            "comparison_operator": cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD,
            "threshold": 1,
            "treat_missing_data": cloudwatch.TreatMissingData.BREACHING,
        },
    }

    for metric_key, alarm_config in metrics_for_alarms.items():
        alarm_id = f"HeadNode{metric_key}Alarm"
        alarm_name = f"{self.stack.stack_name}-HeadNode-{metric_key}"
        alarm = cloudwatch.Alarm(
            scope=self.stack,
            id=alarm_id,
            alarm_name=alarm_name,
            metric=alarm_config["metric"],
            evaluation_periods=alarm_config.get("evaluation_periods", CW_ALARM_EVALUATION_PERIODS_DEFAULT),
            threshold=alarm_config.get("threshold", CW_ALARM_PERCENT_THRESHOLD_DEFAULT),
            comparison_operator=alarm_config.get(
                "comparison_operator", cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD
            ),
            datapoints_to_alarm=alarm_config.get("datapoints_to_alarm", CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT),
            treat_missing_data=alarm_config.get("treat_missing_data", cloudwatch.TreatMissingData.MISSING),
        )
        # NOTE(review): presumably this delays alarm creation until the head node wait condition has
        # completed, so alarms do not evaluate while the node is still bootstrapping -- confirm.
        alarm.node.add_dependency(self.wait_condition)
        self.head_node_alarms.append(alarm)

    # The rule is built from the per-metric alarms only: the composite alarm is appended to the list
    # after the rule has been evaluated, so it never references itself.
    composite_alarm = cloudwatch.CompositeAlarm(
        scope=self.stack,
        id="HeadNodeAlarm",
        composite_alarm_name=f"{self.stack.stack_name}-HeadNode",
        alarm_rule=cloudwatch.AlarmRule.any_of(*self.head_node_alarms),
    )
    composite_alarm.node.add_dependency(self.wait_condition)
    self.head_node_alarms.append(composite_alarm)
399 | 426 |
|
400 | 427 | def _add_iam_resources(self): |
401 | 428 | head_node_iam_resources = HeadNodeIamResources( |
|
0 commit comments