1010# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
1111# See the License for the specific language governing permissions and limitations under the License.
1212import datetime
13+ import logging
1314import math
1415import time
1516
@@ -33,6 +34,7 @@ def test_monitoring(
3334 cw_log_enabled ,
3435 alarms_enabled ,
3536 region ,
37+ scheduler ,
3638 pcluster_config_reader ,
3739 clusters_factory ,
3840 test_datadir ,
@@ -48,13 +50,14 @@ def test_monitoring(
4850 headnode_instance_id = cluster .get_cluster_instance_ids (node_type = "HeadNode" )[0 ]
4951 compute_instance_ids = cluster .get_cluster_instance_ids (node_type = "Compute" )
5052 # the MinCount is set to 1, so we should have at least one compute node
53+ logging .info (f"Retrieved compute nodes: { compute_instance_ids } " )
5154 assert_that (compute_instance_ids ).is_not_empty ()
5255
5356 # test CWAgent metrics
5457 # we only perform this test for one of the 3 test conditions
5558 # because this test could be time-consuming (we allow some retries to ensure we can get metrics data)
5659 if dashboard_enabled and cw_log_enabled :
57- _test_cw_agent_metrics (cw_client , headnode_instance_id , compute_instance_ids [0 ])
60+ _test_metrics (cw_client , headnode_instance_id , compute_instance_ids [0 ], cluster . cfn_name , scheduler )
5861
5962 # test dashboard and alarms
6063 _test_dashboard (cw_client , cluster .cfn_name , region , dashboard_enabled , cw_log_enabled )
@@ -65,28 +68,43 @@ def test_monitoring(
6568
6669
6770@retry (stop_max_attempt_number = 8 , wait_fixed = minutes (2 ))
68- def _test_cw_agent_metrics (cw_client , headnode_instance_id , compute_instance_id ):
71+ def _test_metrics (cw_client , headnode_instance_id , compute_instance_id , cluster_name , scheduler ):
6972 # query for the past 20 minutes
7073 start_timestamp , end_timestamp = _get_start_end_timestamp (minutes = 20 )
7174
72- # test memory and disk metrics are collected for the head node
73- metrics_response_headnode = _get_metric_data (headnode_instance_id , cw_client , start_timestamp , end_timestamp )
75+ # test memory, disk, and clustermgtd heartbeat metrics are collected for the head node
76+ logging .info (f"Retrieving head node metrics from { start_timestamp } to { end_timestamp } " )
77+ metrics_response_headnode = _get_metric_data (
78+ cw_client , start_timestamp , end_timestamp , instance_id = headnode_instance_id , cluster_name = cluster_name
79+ )
80+ logging .info (f"Head node metrics retrieved for the head node" )
7481 mem_values = _get_metric_data_values (metrics_response_headnode , "mem" )
7582 disk_values = _get_metric_data_values (metrics_response_headnode , "disk" )
83+ clustermgtd_heartbeat_values = _get_metric_data_values (metrics_response_headnode , "clustermgtd_heartbeat" )
7684 assert_that (mem_values ).is_not_empty ()
7785 assert_that (disk_values ).is_not_empty ()
7886
87+ if scheduler == "slurm" :
88+ assert_that (clustermgtd_heartbeat_values ).is_not_empty ()
89+ else :
90+ assert_that (clustermgtd_heartbeat_values ).is_empty ()
91+
7992 # wait for additional 1 minute to reduce the chance of false negative result for compute nodes
80- time .sleep (60 )
93+ sleep_seconds = 60
94+ logging .info (f"Waiting { sleep_seconds } seconds for compute node metrics" )
95+ time .sleep (sleep_seconds )
96+
8197 # test memory and disk metrics are not collected for compute nodes
82- metrics_response_compute = _get_metric_data (compute_instance_id , cw_client , start_timestamp , end_timestamp )
98+ metrics_response_compute = _get_metric_data (cw_client , start_timestamp , end_timestamp , compute_instance_id )
8399 mem_values = _get_metric_data_values (metrics_response_compute , "mem" )
84100 disk_values = _get_metric_data_values (metrics_response_compute , "disk" )
85101 assert_that (mem_values ).is_empty ()
86102 assert_that (disk_values ).is_empty ()
87103
88104
89105def _test_dashboard (cw_client , cluster_name , region , dashboard_enabled , cw_log_enabled ):
106+ # TODO: This assertion can be removed because the content of cluster dashboard is covered by unit tests.
107+ # At least let's not expand this assertion with more conditions.
90108 dashboard_name = "{0}-{1}" .format (cluster_name , region )
91109 if dashboard_enabled :
92110 dashboard_response = cw_client .get_dashboard (DashboardName = dashboard_name )
@@ -108,6 +126,8 @@ def _test_dashboard(cw_client, cluster_name, region, dashboard_enabled, cw_log_e
108126
109127
110128def _test_alarms (cw_client , cluster_name , headnode_instance_id , alarms_enabled ):
129+ # TODO: This assertion can be removed because the settings of cluster alarms are covered by unit tests.
130+ # At least let's not expand this assertion with more conditions.
111131 alarm_response = cw_client .describe_alarms (AlarmNamePrefix = cluster_name )
112132 if alarms_enabled :
113133 health_alarm_name = f"{ cluster_name } -HeadNode-Health"
@@ -144,49 +164,71 @@ def _get_start_end_timestamp(minutes):
144164 return start_timestamp , end_timestamp_ceil
145165
146166
147- def _get_metric_data (instance_id , cw_client , start_timestamp , end_timestamp ):
148- metrics_response = cw_client .get_metric_data (
149- MetricDataQueries = [
150- {
151- "Id" : "mem" ,
152- "MetricStat" : {
153- "Metric" : {
154- "Namespace" : "CWAgent" ,
155- "MetricName" : "mem_used_percent" ,
156- "Dimensions" : [
157- {
158- "Name" : "InstanceId" ,
159- "Value" : instance_id ,
160- }
161- ],
162- },
163- "Period" : 60 ,
164- "Stat" : "Maximum" ,
167+ def _get_metric_data (cw_client , start_timestamp , end_timestamp , instance_id , cluster_name = None ):
168+ """
169+ Query CloudWatch metrics.
170+
171+ Args:
172+ cw_client: CloudWatch client
173+ start_timestamp: Start time for the query
174+ end_timestamp: End time for the query
175+ instance_id: EC2 instance ID for CWAgent metrics
176+ cluster_name: Cluster name for ParallelCluster metrics (optional)
177+ """
178+ queries = [
179+ {
180+ "Id" : "mem" ,
181+ "MetricStat" : {
182+ "Metric" : {
183+ "Namespace" : "CWAgent" ,
184+ "MetricName" : "mem_used_percent" ,
185+ "Dimensions" : [{"Name" : "InstanceId" , "Value" : instance_id }],
186+ },
187+ "Period" : 60 ,
188+ "Stat" : "Maximum" ,
189+ },
190+ },
191+ {
192+ "Id" : "disk" ,
193+ "MetricStat" : {
194+ "Metric" : {
195+ "Namespace" : "CWAgent" ,
196+ "MetricName" : "disk_used_percent" ,
197+ "Dimensions" : [
198+ {"Name" : "InstanceId" , "Value" : instance_id },
199+ {"Name" : "path" , "Value" : "/" },
200+ ],
165201 },
202+ "Period" : 60 ,
203+ "Stat" : "Maximum" ,
166204 },
205+ },
206+ ]
207+
208+ if cluster_name :
209+ queries .append (
167210 {
168- "Id" : "disk " ,
211+ "Id" : "clustermgtd_heartbeat " ,
169212 "MetricStat" : {
170213 "Metric" : {
171- "Namespace" : "CWAgent " ,
172- "MetricName" : "disk_used_percent " ,
214+ "Namespace" : "ParallelCluster " ,
215+ "MetricName" : "ClustermgtdHeartbeat " ,
173216 "Dimensions" : [
174- {
175- "Name" : "InstanceId" ,
176- "Value" : instance_id ,
177- },
178- {"Name" : "path" , "Value" : "/" },
217+ {"Name" : "ClusterName" , "Value" : cluster_name },
218+ {"Name" : "InstanceId" , "Value" : instance_id },
179219 ],
180220 },
181221 "Period" : 60 ,
182- "Stat" : "Maximum " ,
222+ "Stat" : "Sum " ,
183223 },
184- },
185- ],
224+ }
225+ )
226+
227+ return cw_client .get_metric_data (
228+ MetricDataQueries = queries ,
186229 StartTime = start_timestamp ,
187230 EndTime = end_timestamp ,
188231 )
189- return metrics_response
190232
191233
192234def _get_metric_data_values (response , query_id ):
0 commit comments