From 9393d784348d212957be5f6b72fb7b99c8f5c919 Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Mon, 18 May 2026 13:38:32 +0000 Subject: [PATCH] exec_ssh_cmd: only retry commands that failed due to SSH errors We used to retry all SSH commands that failed, even those that don't actually exist. We're still retrying commands that are expected to fail, wasting time unnecessarily during os morphing: coriolis.exception.CoriolisException: Command "readlink -en /dev/sda2" failed on host '172.17.0.3:22' with exit code: 1 2026-05-18 12:14:11.783 1667765 WARNING coriolis.osmorphing.osmount.base [-] Target not found for symlink: /dev/sda2. Original link path will be returned What makes things even worse is that the caller functions are often retried as well, having the "retry_on_error" decorator without any parameters. Commands that are expected to fail end up being retried 25 times. We'll update "exec_ssh_cmd" so that by default it will perform retries only in case of SSH errors. For convenience, we're exposing the retry helper arguments (number of retries, retry interval and terminal exceptions). In addition to that, we'll introduce a new argument for the list of retried exceptions. If the caller needs the actual commands to be retried as well, "SSHCommandFailed" may be added to the list of retried exceptions. --- coriolis/exception.py | 4 ++++ coriolis/tests/test_utils.py | 35 ++++++++++++++++++++----------- coriolis/utils.py | 40 +++++++++++++++++++++++++++++------- 3 files changed, 60 insertions(+), 19 deletions(-) diff --git a/coriolis/exception.py b/coriolis/exception.py index 73f38488..d228a7e7 100644 --- a/coriolis/exception.py +++ b/coriolis/exception.py @@ -509,6 +509,10 @@ class MinionMachineCommandTimeout(CoriolisException): pass +class SSHCommandFailed(CoriolisException): + pass + + class SSHCommandNotFoundException(CoriolisException): pass diff --git a/coriolis/tests/test_utils.py b/coriolis/tests/test_utils.py index 2bfc8cbf..2ad18cd1 100644 --- a/coriolis/tests/test_utils.py +++ b/coriolis/tests/test_utils.py @@ -83,6 +83,26 @@ def test_retry_on_error_exception_keyboard_interrupt(self): self.assertEqual(self.mock_func.call_count, 1) + def test_retry_on_error_not_in_list_of_retried(self): + self.mock_func.side_effect = ValueError + + self.assertRaises(ValueError, utils.retry_on_error( + max_attempts=5, sleep_seconds=0, + terminal_exceptions=[], + retried_exceptions=(CoriolisTestException, ))(self.mock_func)) + + self.assertEqual(self.mock_func.call_count, 1) + + def test_retry_on_error_in_list_of_retried(self): + self.mock_func.side_effect = CoriolisTestException + + self.assertRaises(CoriolisTestException, utils.retry_on_error( + max_attempts=5, sleep_seconds=0, + terminal_exceptions=[], + retried_exceptions=(CoriolisTestException, ))(self.mock_func)) + + self.assertEqual(self.mock_func.call_count, 5) + def test_retry_on_error_terminal_exception(self): self.mock_func.side_effect = CoriolisTestException @@ -383,10 +403,7 @@ def test_exec_ssh_cmd_exception(self): self.mock_ssh.exec_command.return_value = (None, self.mock_stdout, self.mock_stdout) - original_exec_ssh_cmd = testutils.get_wrapped_function( - utils.exec_ssh_cmd) - - self.assertRaises(exception.CoriolisException, original_exec_ssh_cmd, + self.assertRaises(exception.SSHCommandFailed, utils.exec_ssh_cmd, self.mock_ssh, "command") self.mock_ssh.exec_command.assert_called_once_with( @@ -398,11 +415,8 @@ def test_exec_ssh_cmd_command_not_found_in_stdout(self): self.mock_ssh.exec_command.return_value = (None, self.mock_stdout, self.mock_stdout) - original_exec_ssh_cmd = testutils.get_wrapped_function( - utils.exec_ssh_cmd) - self.assertRaises(exception.SSHCommandNotFoundException, - original_exec_ssh_cmd, self.mock_ssh, "command") + utils.exec_ssh_cmd, self.mock_ssh, "command") def test_exec_ssh_cmd_exit_code_127(self): self.mock_stdout.read.return_value = b'' @@ -410,11 +424,8 @@ def test_exec_ssh_cmd_exit_code_127(self): self.mock_ssh.exec_command.return_value = (None, self.mock_stdout, self.mock_stdout) - original_exec_ssh_cmd = testutils.get_wrapped_function( - utils.exec_ssh_cmd) - self.assertRaises(exception.SSHCommandNotFoundException, - original_exec_ssh_cmd, self.mock_ssh, "command") + utils.exec_ssh_cmd, self.mock_ssh, "command") def test_exec_ssh_cmd_chroot(self): self.mock_stdout.read.return_value = b'output\n' diff --git a/coriolis/utils.py b/coriolis/utils.py index 0655e05a..592e888e 100644 --- a/coriolis/utils.py +++ b/coriolis/utils.py @@ -168,7 +168,8 @@ def get_single_result(lis): def retry_on_error(max_attempts=5, sleep_seconds=1, - terminal_exceptions=[]): + terminal_exceptions=[], + retried_exceptions=(Exception, )): def _retry_on_error(func): @functools.wraps(func) def _exec_retry(*args, **kwargs): @@ -180,7 +181,7 @@ def _exec_retry(*args, **kwargs): LOG.debug("Got a KeyboardInterrupt, skip retrying") LOG.exception(ex) raise - except Exception as ex: + except retried_exceptions as ex: if any([isinstance(ex, tex) for tex in terminal_exceptions]): raise @@ -314,10 +315,7 @@ def list_ssh_dir(ssh, remote_path): return [f for f in output.splitlines() if f.strip()] -@retry_on_error(terminal_exceptions=[ - exception.MinionMachineCommandTimeout, - exception.SSHCommandNotFoundException]) -def exec_ssh_cmd(ssh, cmd, environment=None, get_pty=False, timeout=None): +def _exec_ssh_cmd(ssh, cmd, environment=None, get_pty=False, timeout=None): remote_str = "" if timeout is not None: timeout = float(timeout) @@ -350,7 +348,7 @@ def exec_ssh_cmd(ssh, cmd, environment=None, get_pty=False, timeout=None): "command not found" in stdout_str or "command not found" in stderr_str): raise exception.SSHCommandNotFoundException(msg) - raise exception.CoriolisException(msg) + raise exception.SSHCommandFailed(msg) # Most of the commands will use pseudo-terminal which unfortunately will # include a '\r' to every newline. This will affect all plugins too, so # best we can do now is replace them. @@ -360,6 +358,34 @@ def exec_ssh_cmd(ssh, cmd, environment=None, get_pty=False, timeout=None): return std_out.decode('utf-8', errors='replace') +def exec_ssh_cmd( + ssh, + cmd, + environment=None, + get_pty=False, + timeout=None, + max_attempts=5, + retry_interval=1, + terminal_exceptions=( + exception.MinionMachineCommandTimeout, + exception.SSHCommandNotFoundException, + ), + retried_exceptions=(paramiko.ssh_exception.SSHException,), +): + # By default, we'll perform retires only in case of SSH failures. + # + # Add "SSHCommandFailed" to the list of retried exceptions if failed + # commands should be retried as well. + + @retry_on_error(max_attempts=max_attempts, sleep_seconds=retry_interval, + terminal_exceptions=terminal_exceptions, + retried_exceptions=retried_exceptions) + def wrapper(): + return _exec_ssh_cmd(ssh, cmd, environment, get_pty, timeout) + + return wrapper() + + def exec_ssh_cmd_chroot(ssh, chroot_dir, cmd, environment=None, get_pty=False, timeout=None): return exec_ssh_cmd(ssh, "sudo -E chroot %s %s" % (chroot_dir, cmd),