Commit 94cdfe5

[Test] Extend integ test test_update_rollback_failure to validate that clustermgtd gets restarted when the async flow of update-compute-fleet fails.
1 parent 98074f5

1 file changed: 61 additions & 2 deletions

tests/integration-tests/tests/update/test_update_rollback_failure.py
@@ -20,11 +20,14 @@
 from retrying import retry
 from time_utils import minutes, seconds
 from utils import (
+    check_status,
     get_compute_nodes_instance_ids,
     get_file_mtime_age_seconds,
     match_regex_in_log,
+    wait_for_computefleet_changed,
 )

+from tests.common.assertions import wait_for_num_instances_in_cluster
 from tests.common.schedulers_common import SlurmCommands

 logger = logging.getLogger(__name__)
@@ -175,6 +178,27 @@ def test_update_rollback_failure(
     # region, cluster.name, cn3_instance_id, initial_config_version
     # )

+    # Inject failure in the async workflow of update-compute-fleet.
+    # This will cause the recipe update_compute_fleet_status to fail when clustermgtd is stopped.
+    # The goal is to verify that clustermgtd is restarted on failure and that the functionality
+    # resumes once the failure goes away.
+    logger.info("Injecting failure on compute-fleet status update...")
+    _inject_slurm_fleet_status_manager_failure(remote_command_executor)
+    cluster.stop()
+
+    _wait_for_slurm_fleet_status_manager_failure(remote_command_executor)
+    check_status(cluster, compute_fleet_status="STOPPING")
+    retry(wait_fixed=seconds(5), stop_max_delay=seconds(20))(_verify_clustermgtd_running)(remote_command_executor)
+
+    # After removing the injected failure, we expect the compute-fleet change functionality to resume, that is:
+    # 1. We expect the compute fleet to reach the STOPPED state, with no compute nodes running.
+    # 2. We expect to be able to restart the fleet.
+    _restore_slurm_fleet_status_manager(remote_command_executor)
+    wait_for_computefleet_changed(cluster, "STOPPED")
+    wait_for_num_instances_in_cluster(cluster.name, region, 0)
+    cluster.start()
+    wait_for_computefleet_changed(cluster, "RUNNING")
+    _wait_for_static_nodes_ready(slurm_commands, expected_count=N_STATIC_NODES)
     logger.info("All verifications passed!")

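The retry(wait_fixed=seconds(5), stop_max_delay=seconds(20))(...) expression above uses the retrying library inline: retry(...) builds a decorator, which is applied to _verify_clustermgtd_running and immediately invoked, polling the check every 5 seconds for up to 20 seconds. _verify_clustermgtd_running itself is defined elsewhere in the test module, not in this diff; a minimal sketch of what such a check might look like, assuming clustermgtd is visible in the head node's process table (the helper body and the pgrep approach are assumptions, not taken from this commit):

# Hypothetical sketch, not part of this diff: a clustermgtd liveness check that the
# retry-wrapped call above could be backed by. The [c] bracket trick keeps pgrep -f
# from matching the SSH shell whose own command line also contains "clustermgtd".
def _verify_clustermgtd_running(remote_command_executor):
    result = remote_command_executor.run_remote_command("pgrep -f '[c]lustermgtd'", raise_on_error=False)
    assert result.return_code == 0, "clustermgtd is not running on the head node"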
@@ -298,11 +322,33 @@ def _inject_slurmctld_restart_failure(remote_command_executor):

 def _restore_slurmctld(remote_command_executor):
     """Restore execute permission on slurmctld."""
-    remote_command_executor.run_remote_command("sudo chmod +x /opt/slurm/sbin/slurmctld;")
+    remote_command_executor.run_remote_command("sudo chmod +x /opt/slurm/sbin/slurmctld")
     remote_command_executor.run_remote_command("sudo systemctl restart slurmctld")
     logger.info("slurmctld restored")


+def _inject_slurm_fleet_status_manager_failure(remote_command_executor):
+    """
+    Inject failure into the async path of update-compute-fleet by removing execute permission
+    from slurm_fleet_status_manager.
+
+    When the update_compute_fleet_status recipe tries to run slurm_fleet_status_manager, it will fail
+    because the script is not executable.
+    """
+    remote_command_executor.run_remote_command(
+        "sudo chmod -x /opt/parallelcluster/scripts/slurm/slurm_fleet_status_manager"
+    )
+    logger.info("slurm_fleet_status_manager made non-executable - update-compute-fleet async path will fail")
+
+
+def _restore_slurm_fleet_status_manager(remote_command_executor):
+    """Restore execute permission on slurm_fleet_status_manager."""
+    remote_command_executor.run_remote_command(
+        "sudo chmod +x /opt/parallelcluster/scripts/slurm/slurm_fleet_status_manager"
+    )
+    logger.info("slurm_fleet_status_manager restored")
+
+
 def _disable_check_update_timer_on_compute_node(remote_command_executor, node_name):
     """
     Disable pcluster-check-update on a compute node using srun.
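Injecting the failure by dropping the execute bit is cheap to apply, cheap to assert, and trivially reversible. A hypothetical usage sketch (an assumption, not part of this commit) that confirms the injection took effect before the fleet is stopped:

# Hypothetical check, not part of this diff: verify the script lost its execute bit
# right after _inject_slurm_fleet_status_manager_failure, so a silently failed chmod
# cannot let the test pass vacuously.
result = remote_command_executor.run_remote_command(
    "test -x /opt/parallelcluster/scripts/slurm/slurm_fleet_status_manager", raise_on_error=False
)
assert result.return_code != 0, "slurm_fleet_status_manager is still executable"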
@@ -354,7 +400,20 @@ def _wait_for_rollback_failure(rce: RemoteCommandExecutor):
         r"ShellCommandFailed: execute\[check slurmctld status\] \(aws-parallelcluster-slurm::update_head_node",
     )
     if not match:
-        raise Exception(f"Update recipe never reached the cluster readiness checks. Last lines: {lines}")
+        raise Exception(f"Update recipe never reached the slurmctld status check failure. Last lines: {lines}")
+    logger.info(f"Update recipe reached the expected failure: {lines}")
+
+
+@retry(wait_fixed=seconds(15), stop_max_delay=minutes(5))
+def _wait_for_slurm_fleet_status_manager_failure(rce: RemoteCommandExecutor):
+    match, lines = match_regex_in_log(
+        rce,
+        "/var/log/chef-client.log",
+        r"ShellCommandFailed: bash\[update compute fleet\] "
+        r"\(aws-parallelcluster-slurm::update_computefleet_status_head_node",
+    )
+    if not match:
+        raise Exception(f"Update recipe never reached the compute fleet script failure. Last lines: {lines}")
     logger.info(f"Update recipe reached the expected failure: {lines}")

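Both wait helpers poll /var/log/chef-client.log through match_regex_in_log, which is imported from utils at the top of the file and is not part of this diff. A minimal sketch of the shape such a utility might plausibly have, assuming it tails the log over SSH and regex-searches the captured output (the tail depth and return shape are assumptions):

# Hypothetical sketch, not from this commit: tail the remote log and search it,
# returning the match object (or None) together with the lines that were inspected,
# matching how the callers above unpack "match, lines".
import re

def match_regex_in_log(remote_command_executor, log_path, pattern):
    lines = remote_command_executor.run_remote_command(f"sudo tail -n 200 {log_path}").stdout
    return re.search(pattern, lines), lines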