|
64 | 64 | CW_ALARM_PERIOD_DEFAULT, |
65 | 65 | CW_LOG_GROUP_NAME_PREFIX, |
66 | 66 | CW_LOGS_CFN_PARAM_NAME, |
| 67 | + CW_METRICS_CLUSTERMGTD_HEARTBEAT, |
| 68 | + CW_METRICS_DIMENSION_CLUSTER_NAME, |
| 69 | + CW_METRICS_NAMESPACE, |
67 | 70 | DEFAULT_EPHEMERAL_DIR, |
68 | 71 | EFS_PORT, |
69 | 72 | FSX_PORTS, |
@@ -365,37 +368,77 @@ def _add_head_node_alarms(self): |
365 | 368 | self.head_node_alarms = [] |
366 | 369 |
|
367 | 370 | metrics_for_alarms = { |
368 | | - "Health": self._cw_metric_head_node("AWS/EC2", "StatusCheckFailed"), |
369 | | - "Cpu": self._cw_metric_head_node("AWS/EC2", "CPUUtilization"), |
370 | | - "Mem": self._cw_metric_head_node("CWAgent", "mem_used_percent"), |
371 | | - "Disk": self._cw_metric_head_node("CWAgent", "disk_used_percent", extra_dimensions={"path": "/"}), |
| 371 | + "Health": { |
| 372 | + "metric": self._cw_metric_head_node("AWS/EC2", "StatusCheckFailed"), |
| 373 | + "evaluation_periods": CW_ALARM_EVALUATION_PERIODS_DEFAULT, |
| 374 | + "datapoints_to_alarm": CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT, |
| 375 | + "comparison_operator": cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD, |
| 376 | + "threshold": 0, |
| 377 | + "treat_missing_data": cloudwatch.TreatMissingData.MISSING, |
| 378 | + }, |
| 379 | + "Cpu": { |
| 380 | + "metric": self._cw_metric_head_node("AWS/EC2", "CPUUtilization"), |
| 381 | + "evaluation_periods": CW_ALARM_EVALUATION_PERIODS_DEFAULT, |
| 382 | + "datapoints_to_alarm": CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT, |
| 383 | + "comparison_operator": cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD, |
| 384 | + "threshold": CW_ALARM_PERCENT_THRESHOLD_DEFAULT, |
| 385 | + "treat_missing_data": cloudwatch.TreatMissingData.MISSING, |
| 386 | + }, |
| 387 | + "Mem": { |
| 388 | + "metric": self._cw_metric_head_node("CWAgent", "mem_used_percent"), |
| 389 | + "evaluation_periods": CW_ALARM_EVALUATION_PERIODS_DEFAULT, |
| 390 | + "datapoints_to_alarm": CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT, |
| 391 | + "comparison_operator": cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD, |
| 392 | + "threshold": CW_ALARM_PERCENT_THRESHOLD_DEFAULT, |
| 393 | + "treat_missing_data": cloudwatch.TreatMissingData.MISSING, |
| 394 | + }, |
| 395 | + "Disk": { |
| 396 | + "metric": self._cw_metric_head_node("CWAgent", "disk_used_percent", extra_dimensions={"path": "/"}), |
| 397 | + "evaluation_periods": CW_ALARM_EVALUATION_PERIODS_DEFAULT, |
| 398 | + "datapoints_to_alarm": CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT, |
| 399 | + "comparison_operator": cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD, |
| 400 | + "threshold": CW_ALARM_PERCENT_THRESHOLD_DEFAULT, |
| 401 | + "treat_missing_data": cloudwatch.TreatMissingData.MISSING, |
| 402 | + }, |
| 403 | + "ClustermgtdHeartbeat": { |
| 404 | + "metric": self._cw_metric_head_node( |
| 405 | + CW_METRICS_NAMESPACE, |
| 406 | + CW_METRICS_CLUSTERMGTD_HEARTBEAT, |
| 407 | + extra_dimensions={CW_METRICS_DIMENSION_CLUSTER_NAME: self.config.cluster_name}, |
| 408 | + ), |
| 409 | + "evaluation_periods": 10, |
| 410 | + "datapoints_to_alarm": 10, |
| 411 | + "comparison_operator": cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD, |
| 412 | + "threshold": 1, |
| 413 | + "treat_missing_data": cloudwatch.TreatMissingData.BREACHING, |
| 414 | + }, |
372 | 415 | } |
373 | 416 |
|
374 | | - for metric_key, metric in metrics_for_alarms.items(): |
| 417 | + for metric_key, alarm_details in metrics_for_alarms.items(): |
375 | 418 | alarm_id = f"HeadNode{metric_key}Alarm" |
376 | 419 | alarm_name = f"{self.stack.stack_name}-HeadNode-{metric_key}" |
377 | | - threshold = 0 if metric_key == "Health" else CW_ALARM_PERCENT_THRESHOLD_DEFAULT |
378 | | - self.head_node_alarms.append( |
379 | | - cloudwatch.Alarm( |
380 | | - scope=self.stack, |
381 | | - id=alarm_id, |
382 | | - alarm_name=alarm_name, |
383 | | - metric=metric, |
384 | | - evaluation_periods=CW_ALARM_EVALUATION_PERIODS_DEFAULT, |
385 | | - threshold=threshold, |
386 | | - comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD, |
387 | | - datapoints_to_alarm=CW_ALARM_DATAPOINTS_TO_ALARM_DEFAULT, |
388 | | - ) |
389 | | - ) |
390 | | - |
391 | | - self.head_node_alarms.append( |
392 | | - cloudwatch.CompositeAlarm( |
| 420 | + alarm = cloudwatch.Alarm( |
393 | 421 | scope=self.stack, |
394 | | - id="HeadNodeAlarm", |
395 | | - composite_alarm_name=f"{self.stack.stack_name}-HeadNode", |
396 | | - alarm_rule=cloudwatch.AlarmRule.any_of(*self.head_node_alarms), |
| 422 | + id=alarm_id, |
| 423 | + alarm_name=alarm_name, |
| 424 | + metric=alarm_details["metric"], |
| 425 | + evaluation_periods=alarm_details["evaluation_periods"], |
| 426 | + threshold=alarm_details["threshold"], |
| 427 | + comparison_operator=alarm_details["comparison_operator"], |
| 428 | + datapoints_to_alarm=alarm_details["datapoints_to_alarm"], |
| 429 | + treat_missing_data=alarm_details["treat_missing_data"], |
397 | 430 | ) |
| 431 | + alarm.node.add_dependency(self.wait_condition) |
| 432 | + self.head_node_alarms.append(alarm) |
| 433 | + |
| 434 | + composite_alarm = cloudwatch.CompositeAlarm( |
| 435 | + scope=self.stack, |
| 436 | + id="HeadNodeAlarm", |
| 437 | + composite_alarm_name=f"{self.stack.stack_name}-HeadNode", |
| 438 | + alarm_rule=cloudwatch.AlarmRule.any_of(*self.head_node_alarms), |
398 | 439 | ) |
| 440 | + composite_alarm.node.add_dependency(self.wait_condition) |
| 441 | + self.head_node_alarms.append(composite_alarm) |
399 | 442 |
|
400 | 443 | def _add_iam_resources(self): |
401 | 444 | head_node_iam_resources = HeadNodeIamResources( |
|
0 commit comments