Skip to content

Commit 46dde5f

Browse files
committed
[Test] Extend integ test test_monitoring to verify that the metric ClustermgtdHeartbeat is collected.
1 parent 2edb607 commit 46dde5f

1 file changed

Lines changed: 78 additions & 36 deletions

File tree

tests/integration-tests/tests/monitoring/test_monitoring.py

Lines changed: 78 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
1111
# See the License for the specific language governing permissions and limitations under the License.
1212
import datetime
13+
import logging
1314
import math
1415
import time
1516

@@ -33,6 +34,7 @@ def test_monitoring(
3334
cw_log_enabled,
3435
alarms_enabled,
3536
region,
37+
scheduler,
3638
pcluster_config_reader,
3739
clusters_factory,
3840
test_datadir,
@@ -48,13 +50,14 @@ def test_monitoring(
4850
headnode_instance_id = cluster.get_cluster_instance_ids(node_type="HeadNode")[0]
4951
compute_instance_ids = cluster.get_cluster_instance_ids(node_type="Compute")
5052
# the MinCount is set to 1, so we should have at least one compute node
53+
logging.info(f"Retrieved compute nodes: {compute_instance_ids}")
5154
assert_that(compute_instance_ids).is_not_empty()
5255

5356
# test CWAgent metrics
5457
# we only perform this test for one of the 3 test conditions
5558
# because this test could be time-consuming (we allow some retries to ensure we can get metrics data)
5659
if dashboard_enabled and cw_log_enabled:
57-
_test_cw_agent_metrics(cw_client, headnode_instance_id, compute_instance_ids[0])
60+
_test_metrics(cw_client, headnode_instance_id, compute_instance_ids[0], cluster.cfn_name, scheduler)
5861

5962
# test dashboard and alarms
6063
_test_dashboard(cw_client, cluster.cfn_name, region, dashboard_enabled, cw_log_enabled)
@@ -65,28 +68,43 @@ def test_monitoring(
6568

6669

6770
@retry(stop_max_attempt_number=8, wait_fixed=minutes(2))
68-
def _test_cw_agent_metrics(cw_client, headnode_instance_id, compute_instance_id):
71+
def _test_metrics(cw_client, headnode_instance_id, compute_instance_id, cluster_name, scheduler):
6972
# query for the past 20 minutes
7073
start_timestamp, end_timestamp = _get_start_end_timestamp(minutes=20)
7174

72-
# test memory and disk metrics are collected for the head node
73-
metrics_response_headnode = _get_metric_data(headnode_instance_id, cw_client, start_timestamp, end_timestamp)
75+
# test memory, disk, and clustermgtd heartbeat metrics are collected for the head node
76+
logging.info(f"Retrieving head node metrics from {start_timestamp} to {end_timestamp}")
77+
metrics_response_headnode = _get_metric_data(
78+
cw_client, start_timestamp, end_timestamp, instance_id=headnode_instance_id, cluster_name=cluster_name
79+
)
80+
logging.info(f"Head node metrics retrieved for the head node")
7481
mem_values = _get_metric_data_values(metrics_response_headnode, "mem")
7582
disk_values = _get_metric_data_values(metrics_response_headnode, "disk")
83+
clustermgtd_heartbeat_values = _get_metric_data_values(metrics_response_headnode, "clustermgtd_heartbeat")
7684
assert_that(mem_values).is_not_empty()
7785
assert_that(disk_values).is_not_empty()
7886

87+
if scheduler == "slurm":
88+
assert_that(clustermgtd_heartbeat_values).is_not_empty()
89+
else:
90+
assert_that(clustermgtd_heartbeat_values).is_empty()
91+
7992
# wait for additional 1 minute to reduce the chance of false negative result for compute nodes
80-
time.sleep(60)
93+
sleep_seconds = 60
94+
logging.info(f"Waiting {sleep_seconds} seconds for compute node metrics")
95+
time.sleep(sleep_seconds)
96+
8197
# test memory and disk metrics are not collected for compute nodes
82-
metrics_response_compute = _get_metric_data(compute_instance_id, cw_client, start_timestamp, end_timestamp)
98+
metrics_response_compute = _get_metric_data(cw_client, start_timestamp, end_timestamp, compute_instance_id)
8399
mem_values = _get_metric_data_values(metrics_response_compute, "mem")
84100
disk_values = _get_metric_data_values(metrics_response_compute, "disk")
85101
assert_that(mem_values).is_empty()
86102
assert_that(disk_values).is_empty()
87103

88104

89105
def _test_dashboard(cw_client, cluster_name, region, dashboard_enabled, cw_log_enabled):
106+
# TODO: This assertion can be removed because the content of cluster dashboard is covered by unit tests.
107+
# At least let's not expand this assertion with more conditions.
90108
dashboard_name = "{0}-{1}".format(cluster_name, region)
91109
if dashboard_enabled:
92110
dashboard_response = cw_client.get_dashboard(DashboardName=dashboard_name)
@@ -108,6 +126,8 @@ def _test_dashboard(cw_client, cluster_name, region, dashboard_enabled, cw_log_e
108126

109127

110128
def _test_alarms(cw_client, cluster_name, headnode_instance_id, alarms_enabled):
129+
# TODO: This assertion can be removed because the settings of cluster alarms are covered by unit tests.
130+
# At least let's not expand this assertion with more conditions.
111131
alarm_response = cw_client.describe_alarms(AlarmNamePrefix=cluster_name)
112132
if alarms_enabled:
113133
health_alarm_name = f"{cluster_name}-HeadNode-Health"
@@ -144,49 +164,71 @@ def _get_start_end_timestamp(minutes):
144164
return start_timestamp, end_timestamp_ceil
145165

146166

147-
def _get_metric_data(instance_id, cw_client, start_timestamp, end_timestamp):
148-
metrics_response = cw_client.get_metric_data(
149-
MetricDataQueries=[
150-
{
151-
"Id": "mem",
152-
"MetricStat": {
153-
"Metric": {
154-
"Namespace": "CWAgent",
155-
"MetricName": "mem_used_percent",
156-
"Dimensions": [
157-
{
158-
"Name": "InstanceId",
159-
"Value": instance_id,
160-
}
161-
],
162-
},
163-
"Period": 60,
164-
"Stat": "Maximum",
167+
def _get_metric_data(cw_client, start_timestamp, end_timestamp, instance_id, cluster_name=None):
168+
"""
169+
Query CloudWatch metrics.
170+
171+
Args:
172+
cw_client: CloudWatch client
173+
start_timestamp: Start time for the query
174+
end_timestamp: End time for the query
175+
instance_id: EC2 instance ID for CWAgent metrics
176+
cluster_name: Cluster name for ParallelCluster metrics (optional)
177+
"""
178+
queries = [
179+
{
180+
"Id": "mem",
181+
"MetricStat": {
182+
"Metric": {
183+
"Namespace": "CWAgent",
184+
"MetricName": "mem_used_percent",
185+
"Dimensions": [{"Name": "InstanceId", "Value": instance_id}],
186+
},
187+
"Period": 60,
188+
"Stat": "Maximum",
189+
},
190+
},
191+
{
192+
"Id": "disk",
193+
"MetricStat": {
194+
"Metric": {
195+
"Namespace": "CWAgent",
196+
"MetricName": "disk_used_percent",
197+
"Dimensions": [
198+
{"Name": "InstanceId", "Value": instance_id},
199+
{"Name": "path", "Value": "/"},
200+
],
165201
},
202+
"Period": 60,
203+
"Stat": "Maximum",
166204
},
205+
},
206+
]
207+
208+
if cluster_name:
209+
queries.append(
167210
{
168-
"Id": "disk",
211+
"Id": "clustermgtd_heartbeat",
169212
"MetricStat": {
170213
"Metric": {
171-
"Namespace": "CWAgent",
172-
"MetricName": "disk_used_percent",
214+
"Namespace": "ParallelCluster",
215+
"MetricName": "ClustermgtdHeartbeat",
173216
"Dimensions": [
174-
{
175-
"Name": "InstanceId",
176-
"Value": instance_id,
177-
},
178-
{"Name": "path", "Value": "/"},
217+
{"Name": "ClusterName", "Value": cluster_name},
218+
{"Name": "InstanceId", "Value": instance_id},
179219
],
180220
},
181221
"Period": 60,
182-
"Stat": "Maximum",
222+
"Stat": "Sum",
183223
},
184-
},
185-
],
224+
}
225+
)
226+
227+
return cw_client.get_metric_data(
228+
MetricDataQueries=queries,
186229
StartTime=start_timestamp,
187230
EndTime=end_timestamp,
188231
)
189-
return metrics_response
190232

191233

192234
def _get_metric_data_values(response, query_id):

0 commit comments

Comments
 (0)