Skip to content

Commit 39a655e

Browse files
committed
[Test] Extend integ test test_monitoring to verify that the metric ClustermgtdHeartbeat is collected.
1 parent 2edb607 commit 39a655e

1 file changed

Lines changed: 68 additions & 34 deletions

File tree

tests/integration-tests/tests/monitoring/test_monitoring.py

Lines changed: 68 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def test_monitoring(
5454
# we only perform this test for one of the 3 test conditions
5555
# because this test could be time-consuming (we allow some retries to ensure we can get metrics data)
5656
if dashboard_enabled and cw_log_enabled:
57-
_test_cw_agent_metrics(cw_client, headnode_instance_id, compute_instance_ids[0])
57+
_test_metrics(cw_client, headnode_instance_id, compute_instance_ids[0], cluster.cfn_name, scheduler)
5858

5959
# test dashboard and alarms
6060
_test_dashboard(cw_client, cluster.cfn_name, region, dashboard_enabled, cw_log_enabled)
@@ -65,17 +65,25 @@ def test_monitoring(
6565

6666

6767
@retry(stop_max_attempt_number=8, wait_fixed=minutes(2))
68-
def _test_cw_agent_metrics(cw_client, headnode_instance_id, compute_instance_id):
68+
def _test_metrics(cw_client, headnode_instance_id, compute_instance_id, cluster_name, scheduler):
6969
# query for the past 20 minutes
7070
start_timestamp, end_timestamp = _get_start_end_timestamp(minutes=20)
7171

72-
# test memory and disk metrics are collected for the head node
73-
metrics_response_headnode = _get_metric_data(headnode_instance_id, cw_client, start_timestamp, end_timestamp)
72+
# test memory, disk, and clustermgtd heartbeat metrics are collected for the head node
73+
metrics_response_headnode = _get_metric_data(
74+
cw_client, start_timestamp, end_timestamp, instance_id=headnode_instance_id, cluster_name=cluster_name
75+
)
7476
mem_values = _get_metric_data_values(metrics_response_headnode, "mem")
7577
disk_values = _get_metric_data_values(metrics_response_headnode, "disk")
78+
clustermgtd_heartbeat_values = _get_metric_data_values(metrics_response_headnode, "clustermgtd_heartbeat")
7679
assert_that(mem_values).is_not_empty()
7780
assert_that(disk_values).is_not_empty()
7881

82+
if scheduler == "slurm":
83+
assert_that(clustermgtd_heartbeat_values).is_not_empty()
84+
else:
85+
assert_that(clustermgtd_heartbeat_values).is_empty()
86+
7987
# wait an additional minute to reduce the chance of a false-negative result for compute nodes
8088
time.sleep(60)
8189
# test memory and disk metrics are not collected for compute nodes
@@ -87,6 +95,8 @@ def _test_cw_agent_metrics(cw_client, headnode_instance_id, compute_instance_id)
8795

8896

8997
def _test_dashboard(cw_client, cluster_name, region, dashboard_enabled, cw_log_enabled):
98+
# TODO: This assertion can be removed because the content of the cluster dashboard is covered by unit tests.
99+
# At least let's not expand this assertion with more conditions.
90100
dashboard_name = "{0}-{1}".format(cluster_name, region)
91101
if dashboard_enabled:
92102
dashboard_response = cw_client.get_dashboard(DashboardName=dashboard_name)
@@ -108,6 +118,8 @@ def _test_dashboard(cw_client, cluster_name, region, dashboard_enabled, cw_log_e
108118

109119

110120
def _test_alarms(cw_client, cluster_name, headnode_instance_id, alarms_enabled):
121+
# TODO: This assertion can be removed because the settings of cluster alarms are covered by unit tests.
122+
# At least let's not expand this assertion with more conditions.
111123
alarm_response = cw_client.describe_alarms(AlarmNamePrefix=cluster_name)
112124
if alarms_enabled:
113125
health_alarm_name = f"{cluster_name}-HeadNode-Health"
@@ -144,49 +156,71 @@ def _get_start_end_timestamp(minutes):
144156
return start_timestamp, end_timestamp_ceil
145157

146158

def _get_metric_data(cw_client, start_timestamp, end_timestamp, instance_id, cluster_name=None):
    """
    Query CloudWatch for the metrics asserted on by the monitoring test.

    The CWAgent memory ("mem") and root-path disk ("disk") usage metrics are always requested for
    the given instance. When a cluster name is supplied, the ParallelCluster ClustermgtdHeartbeat
    metric ("clustermgtd_heartbeat") is requested as well for that cluster/instance pair.

    Args:
        cw_client: CloudWatch client
        start_timestamp: Start time for the query
        end_timestamp: End time for the query
        instance_id: EC2 instance ID, used as the InstanceId dimension of every query
        cluster_name: Cluster name for ParallelCluster metrics (optional); when falsy,
            the heartbeat query is not included in the request

    Returns:
        The response returned by ``cw_client.get_metric_data``, unmodified.
    """

    def _query(query_id, namespace, metric_name, dimensions, stat):
        # Every query uses the same 60-second period; only the metric identity and statistic differ.
        return {
            "Id": query_id,
            "MetricStat": {
                "Metric": {
                    "Namespace": namespace,
                    "MetricName": metric_name,
                    "Dimensions": [{"Name": name, "Value": value} for name, value in dimensions],
                },
                "Period": 60,
                "Stat": stat,
            },
        }

    queries = [
        _query("mem", "CWAgent", "mem_used_percent", [("InstanceId", instance_id)], "Maximum"),
        _query(
            "disk",
            "CWAgent",
            "disk_used_percent",
            [("InstanceId", instance_id), ("path", "/")],
            "Maximum",
        ),
    ]

    if cluster_name:
        # The heartbeat metric lives in the ParallelCluster namespace and is keyed by cluster and instance.
        queries.append(
            _query(
                "clustermgtd_heartbeat",
                "ParallelCluster",
                "ClustermgtdHeartbeat",
                [("ClusterName", cluster_name), ("InstanceId", instance_id)],
                "Sum",
            )
        )

    return cw_client.get_metric_data(
        MetricDataQueries=queries,
        StartTime=start_timestamp,
        EndTime=end_timestamp,
    )
190224

191225

192226
def _get_metric_data_values(response, query_id):

0 commit comments

Comments
 (0)