20 | 20 | from retrying import retry |
21 | 21 | from time_utils import minutes, seconds |
22 | 22 | from utils import ( |
| 23 | + check_status, |
23 | 24 | get_compute_nodes_instance_ids, |
24 | 25 | get_file_mtime_age_seconds, |
25 | 26 | match_regex_in_log, |
| 27 | + wait_for_computefleet_changed, |
26 | 28 | ) |
27 | 29 |
| 30 | +from tests.common.assertions import wait_for_num_instances_in_cluster |
28 | 31 | from tests.common.schedulers_common import SlurmCommands |
29 | 32 |
30 | 33 | logger = logging.getLogger(__name__) |
@@ -175,6 +178,27 @@ def test_update_rollback_failure( |
175 | 178 | # region, cluster.name, cn3_instance_id, initial_config_version |
176 | 179 | # ) |
177 | 180 |
| 181 | + # Inject a failure into the async workflow of update-compute-fleet.
| 182 | + # This causes the update_compute_fleet_status recipe to fail while clustermgtd is stopped.
| 183 | + # The goal is to verify that clustermgtd is restarted on failure and that the compute-fleet
| 184 | + # status change resumes once the failure is removed.
| 185 | + logger.info("Injecting failure on compute-fleet status update...") |
| 186 | + _inject_slurm_fleet_status_manager_failure(remote_command_executor) |
| 187 | + cluster.stop() |
| 188 | + |
| 189 | + _wait_for_slurm_fleet_status_manager_failure(remote_command_executor) |
| 190 | + check_status(cluster, compute_fleet_status="STOPPING") |
| 191 | + retry(wait_fixed=seconds(5), stop_max_delay=seconds(20))(_verify_clustermgtd_running)(remote_command_executor) |
| 192 | + |
| 193 | + # After removing the injected failure, we expect the compute-fleet status change to resume, that is:
| 194 | + # 1. The compute fleet reaches the STOPPED state, with no compute nodes running.
| 195 | + # 2. The fleet can be restarted.
| 196 | + _restore_slurm_fleet_status_manager(remote_command_executor) |
| 197 | + wait_for_computefleet_changed(cluster, "STOPPED") |
| 198 | + wait_for_num_instances_in_cluster(cluster.name, region, 0) |
| 199 | + cluster.start() |
| 200 | + wait_for_computefleet_changed(cluster, "RUNNING") |
| 201 | + _wait_for_static_nodes_ready(slurm_commands, expected_count=N_STATIC_NODES) |
178 | 202 | logger.info("All verifications passed!") |
179 | 203 |
180 | 204 |
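Note that the `retry(...)` call in the test applies the `retrying` decorator inline to `_verify_clustermgtd_running`, whose definition lies outside this diff. A minimal sketch of what such a helper could look like — assuming `run_remote_command` accepts `raise_on_error` and returns a fabric-style result with `return_code` — is:

```python
# Hypothetical sketch only: the real _verify_clustermgtd_running is defined
# elsewhere in this module and is not part of this diff.
def _verify_clustermgtd_running(remote_command_executor):
    """Assert that the clustermgtd daemon process is alive on the head node."""
    result = remote_command_executor.run_remote_command(
        "pgrep -f clustermgtd", raise_on_error=False
    )
    # pgrep exits 0 only when at least one matching process exists.
    assert result.return_code == 0, "clustermgtd is not running on the head node"
```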
@@ -298,11 +322,33 @@ def _inject_slurmctld_restart_failure(remote_command_executor): |
298 | 322 |
299 | 323 | def _restore_slurmctld(remote_command_executor): |
300 | 324 | """Restore execute permission on slurmctld.""" |
301 | | - remote_command_executor.run_remote_command("sudo chmod +x /opt/slurm/sbin/slurmctld;") |
| 325 | + remote_command_executor.run_remote_command("sudo chmod +x /opt/slurm/sbin/slurmctld") |
302 | 326 | remote_command_executor.run_remote_command("sudo systemctl restart slurmctld") |
303 | 327 | logger.info("slurmctld restored") |
304 | 328 |
305 | 329 |
| 330 | +def _inject_slurm_fleet_status_manager_failure(remote_command_executor): |
| 331 | + """ |
| 332 | + Inject a failure into the async path of update-compute-fleet by removing the execute permission
| 333 | + from slurm_fleet_status_manager. |
| 334 | +
| 335 | + When the update_compute_fleet_status recipe tries to run slurm_fleet_status_manager, it will fail |
| 336 | + because the script is not executable. |
| 337 | + """ |
| 338 | + remote_command_executor.run_remote_command( |
| 339 | + "sudo chmod -x /opt/parallelcluster/scripts/slurm/slurm_fleet_status_manager" |
| 340 | + ) |
| 341 | + logger.info("slurm_fleet_status_manager made non-executable - update-compute-fleet async path will fail") |
| 342 | + |
| 343 | + |
| 344 | +def _restore_slurm_fleet_status_manager(remote_command_executor): |
| 345 | + """Restore execute permission on slurm_fleet_status_manager.""" |
| 346 | + remote_command_executor.run_remote_command( |
| 347 | + "sudo chmod +x /opt/parallelcluster/scripts/slurm/slurm_fleet_status_manager" |
| 348 | + ) |
| 349 | + logger.info("slurm_fleet_status_manager restored") |
| 350 | + |
| 351 | + |
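One caveat with the chmod-based inject/restore pair: if an assertion raises between `_inject_slurm_fleet_status_manager_failure` and `_restore_slurm_fleet_status_manager`, the script is left non-executable on the head node. A sketch of one way to guarantee restoration (not part of this change) is a context manager wrapping the failure window:

```python
import contextlib

@contextlib.contextmanager
def _fleet_status_manager_failure(remote_command_executor):
    """Keep slurm_fleet_status_manager non-executable inside the block,
    restoring the execute permission even if an assertion raises."""
    _inject_slurm_fleet_status_manager_failure(remote_command_executor)
    try:
        yield
    finally:
        _restore_slurm_fleet_status_manager(remote_command_executor)
```

The test restores explicitly in mid-flow, so such a wrapper would only cover the steps that verify the failure itself.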
306 | 352 | def _disable_check_update_timer_on_compute_node(remote_command_executor, node_name): |
307 | 353 | """ |
308 | 354 | Disable pcluster-check-update on a compute node using srun. |
@@ -354,7 +400,20 @@ def _wait_for_rollback_failure(rce: RemoteCommandExecutor): |
354 | 400 | r"ShellCommandFailed: execute\[check slurmctld status\] \(aws-parallelcluster-slurm::update_head_node", |
355 | 401 | ) |
356 | 402 | if not match: |
357 | | - raise Exception(f"Update recipe never reached the cluster readiness checks. Last lines: {lines}") |
| 403 | + raise Exception(f"Update recipe never reached the slurmctld status check failure. Last lines: {lines}") |
| 404 | + logger.info(f"Update recipe reached the expected failure: {lines}") |
| 405 | + |
| 406 | + |
| 407 | +@retry(wait_fixed=seconds(15), stop_max_delay=minutes(5)) |
| 408 | +def _wait_for_slurm_fleet_status_manager_failure(rce: RemoteCommandExecutor): |
| 409 | + match, lines = match_regex_in_log( |
| 410 | + rce, |
| 411 | + "/var/log/chef-client.log", |
| 412 | + r"ShellCommandFailed: bash\[update compute fleet\] " |
| 413 | + r"\(aws-parallelcluster-slurm::update_computefleet_status_head_node", |
| 414 | + ) |
| 415 | + if not match: |
| 416 | + raise Exception(f"Update recipe never reached the compute fleet script failure. Last lines: {lines}")
358 | 417 | logger.info(f"Update recipe reached the expected failure: {lines}") |
359 | 418 |
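Both `_wait_for_rollback_failure` and `_wait_for_slurm_fleet_status_manager_failure` depend on `match_regex_in_log`, imported from `utils` and not shown in this diff. The contract they rely on — a `(match, lines)` tuple, with `lines` available for error messages — could be met by a sketch like this (the tail length is an assumption):

```python
import re

def match_regex_in_log(rce, log_path, pattern):
    """Search the tail of a remote log file for a regex.

    Returns (match, lines): the re.Match (or None) and the raw lines scanned,
    so callers can embed the log tail in failure messages.
    """
    lines = rce.run_remote_command(f"sudo tail -n 200 {log_path}").stdout
    return re.search(pattern, lines), lines
```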
360 | 419 |