@@ -54,7 +54,7 @@ def test_monitoring(
5454 # we only perform this test for one of the 3 test conditions
5555 # because this test could be time-consuming (we allow some retries to ensure we can get metrics data)
5656 if dashboard_enabled and cw_log_enabled :
57- _test_cw_agent_metrics (cw_client , headnode_instance_id , compute_instance_ids [0 ])
57+ _test_metrics (cw_client , headnode_instance_id , compute_instance_ids [0 ], cluster . cfn_name , scheduler )
5858
5959 # test dashboard and alarms
6060 _test_dashboard (cw_client , cluster .cfn_name , region , dashboard_enabled , cw_log_enabled )
@@ -65,17 +65,25 @@ def test_monitoring(
6565
6666
6767@retry (stop_max_attempt_number = 8 , wait_fixed = minutes (2 ))
68- def _test_cw_agent_metrics (cw_client , headnode_instance_id , compute_instance_id ):
68+ def _test_metrics (cw_client , headnode_instance_id , compute_instance_id , cluster_name , scheduler ):
6969 # query for the past 20 minutes
7070 start_timestamp , end_timestamp = _get_start_end_timestamp (minutes = 20 )
7171
72- # test memory and disk metrics are collected for the head node
73- metrics_response_headnode = _get_metric_data (headnode_instance_id , cw_client , start_timestamp , end_timestamp )
72+ # test memory, disk, and clustermgtd heartbeat metrics are collected for the head node
73+ metrics_response_headnode = _get_metric_data (
74+ cw_client , start_timestamp , end_timestamp , instance_id = headnode_instance_id , cluster_name = cluster_name
75+ )
7476 mem_values = _get_metric_data_values (metrics_response_headnode , "mem" )
7577 disk_values = _get_metric_data_values (metrics_response_headnode , "disk" )
78+ clustermgtd_heartbeat_values = _get_metric_data_values (metrics_response_headnode , "clustermgtd_heartbeat" )
7679 assert_that (mem_values ).is_not_empty ()
7780 assert_that (disk_values ).is_not_empty ()
7881
82+ if scheduler == "slurm" :
83+ assert_that (clustermgtd_heartbeat_values ).is_not_empty ()
84+ else :
85+ assert_that (clustermgtd_heartbeat_values ).is_empty ()
86+
7987 # wait for additional 1 minute to reduce the chance of false negative result for compute nodes
8088 time .sleep (60 )
8189 # test memory and disk metrics are not collected for compute nodes
@@ -87,6 +95,8 @@ def _test_cw_agent_metrics(cw_client, headnode_instance_id, compute_instance_id)
8795
8896
8997def _test_dashboard (cw_client , cluster_name , region , dashboard_enabled , cw_log_enabled ):
98+ # TODO: This assertion can be removed because the content of cluster dashboard is covered by unit tests.
99+ # At least let's not expand this assertion with more conditions.
90100 dashboard_name = "{0}-{1}" .format (cluster_name , region )
91101 if dashboard_enabled :
92102 dashboard_response = cw_client .get_dashboard (DashboardName = dashboard_name )
@@ -108,6 +118,8 @@ def _test_dashboard(cw_client, cluster_name, region, dashboard_enabled, cw_log_e
108118
109119
110120def _test_alarms (cw_client , cluster_name , headnode_instance_id , alarms_enabled ):
121+ # TODO: This assertion can be removed because the settings of cluster alarms are covered by unit tests.
122+ # At least let's not expand this assertion with more conditions.
111123 alarm_response = cw_client .describe_alarms (AlarmNamePrefix = cluster_name )
112124 if alarms_enabled :
113125 health_alarm_name = f"{ cluster_name } -HeadNode-Health"
@@ -144,49 +156,71 @@ def _get_start_end_timestamp(minutes):
144156 return start_timestamp , end_timestamp_ceil
145157
146158
147- def _get_metric_data (instance_id , cw_client , start_timestamp , end_timestamp ):
148- metrics_response = cw_client .get_metric_data (
149- MetricDataQueries = [
150- {
151- "Id" : "mem" ,
152- "MetricStat" : {
153- "Metric" : {
154- "Namespace" : "CWAgent" ,
155- "MetricName" : "mem_used_percent" ,
156- "Dimensions" : [
157- {
158- "Name" : "InstanceId" ,
159- "Value" : instance_id ,
160- }
161- ],
162- },
163- "Period" : 60 ,
164- "Stat" : "Maximum" ,
159+ def _get_metric_data (cw_client , start_timestamp , end_timestamp , instance_id , cluster_name = None ):
160+ """
161+ Query CloudWatch metrics.
162+
163+ Args:
164+ cw_client: CloudWatch client
165+ start_timestamp: Start time for the query
166+ end_timestamp: End time for the query
167+ instance_id: EC2 instance ID for CWAgent metrics
168+ cluster_name: Cluster name for ParallelCluster metrics (optional)
169+ """
170+ queries = [
171+ {
172+ "Id" : "mem" ,
173+ "MetricStat" : {
174+ "Metric" : {
175+ "Namespace" : "CWAgent" ,
176+ "MetricName" : "mem_used_percent" ,
177+ "Dimensions" : [{"Name" : "InstanceId" , "Value" : instance_id }],
178+ },
179+ "Period" : 60 ,
180+ "Stat" : "Maximum" ,
181+ },
182+ },
183+ {
184+ "Id" : "disk" ,
185+ "MetricStat" : {
186+ "Metric" : {
187+ "Namespace" : "CWAgent" ,
188+ "MetricName" : "disk_used_percent" ,
189+ "Dimensions" : [
190+ {"Name" : "InstanceId" , "Value" : instance_id },
191+ {"Name" : "path" , "Value" : "/" },
192+ ],
165193 },
194+ "Period" : 60 ,
195+ "Stat" : "Maximum" ,
166196 },
197+ },
198+ ]
199+
200+ if cluster_name :
201+ queries .append (
167202 {
168- "Id" : "disk " ,
203+ "Id" : "clustermgtd_heartbeat " ,
169204 "MetricStat" : {
170205 "Metric" : {
171- "Namespace" : "CWAgent " ,
172- "MetricName" : "disk_used_percent " ,
206+ "Namespace" : "ParallelCluster " ,
207+ "MetricName" : "ClustermgtdHeartbeat " ,
173208 "Dimensions" : [
174- {
175- "Name" : "InstanceId" ,
176- "Value" : instance_id ,
177- },
178- {"Name" : "path" , "Value" : "/" },
209+ {"Name" : "ClusterName" , "Value" : cluster_name },
210+ {"Name" : "InstanceId" , "Value" : instance_id },
179211 ],
180212 },
181213 "Period" : 60 ,
182- "Stat" : "Maximum " ,
214+ "Stat" : "Sum " ,
183215 },
184- },
185- ],
216+ }
217+ )
218+
219+ return cw_client .get_metric_data (
220+ MetricDataQueries = queries ,
186221 StartTime = start_timestamp ,
187222 EndTime = end_timestamp ,
188223 )
189- return metrics_response
190224
191225
192226def _get_metric_data_values (response , query_id ):