From 0deb4c4803ca78cfab284b5ba803b905f929d5eb Mon Sep 17 00:00:00 2001 From: jmsperu Date: Sat, 23 May 2026 00:56:30 +0300 Subject: [PATCH 1/3] fix(nas-backup): timeout, free-space check, trap-based cleanup with VM resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three independent reliability fixes for the KVM NAS backup script, layered on top of the existing quiesce + EXIT_CLEANUP_FAILED groundwork: 1. BACKUP_TIMEOUT env var (default 6h) bounds the libvirt domjobinfo wait loop in backup_running_vm. Today a stuck QEMU backup holds the agent's command slot until the orchestrator-level timeout fires. The new guard issues domjobabort and exits non-zero so the agent reclaims the slot promptly. 2. MIN_FREE_SPACE env var (default 1 GiB) + check_free_space() runs after mount and before any qemu-img convert in both backup_running_vm and backup_stopped_vm. Fail-fast on a near-full NAS instead of failing mid-write halfway through a multi-GiB convert. 3. trap cleanup EXIT replaces the six explicit cleanup() call sites as the primary cleanup mechanism so orphan NFS mounts no longer accumulate when the script dies to SIGTERM, SIGINT, or any uncaught set -e failure between the explicit call sites. cleanup() is now guarded by CLEANUP_DONE so the trap doesn't re-run an already-completed cleanup from an explicit call. cleanup() additionally resumes the VM if it's still paused — backup-begin holds the guest paused briefly and a failed backup mid-pause currently leaves the guest stuck in 'paused' state until an operator intervenes. Targets main; supersedes the 4.20-targeted version of this PR. --- scripts/vm/hypervisor/kvm/nasbackup.sh | 74 ++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/scripts/vm/hypervisor/kvm/nasbackup.sh b/scripts/vm/hypervisor/kvm/nasbackup.sh index 441312f35e86..73533d68a0a7 100755 --- a/scripts/vm/hypervisor/kvm/nasbackup.sh +++ b/scripts/vm/hypervisor/kvm/nasbackup.sh @@ -37,6 +37,20 @@ logFile="/var/log/cloudstack/agent/agent.log" EXIT_CLEANUP_FAILED=20 +# Backup job timeout in seconds (default: 6 hours). Guards the libvirt +# domjobinfo wait loop so a stuck QEMU backup eventually fails the script +# instead of holding the agent's command slot indefinitely. +BACKUP_TIMEOUT=${BACKUP_TIMEOUT:-21600} + +# Minimum free space required on the mounted backup target, in bytes +# (default: 1 GiB). Checked after mount, before any qemu-img convert, so +# we fail fast rather than mid-write when the NAS is near-full. +MIN_FREE_SPACE=${MIN_FREE_SPACE:-1073741824} + +# Guards cleanup() against double-execution when both an explicit call +# and the EXIT trap fire (e.g. error path calls cleanup; exit 1 → trap). +CLEANUP_DONE=0 + log() { [[ "$verb" -eq 1 ]] && builtin echo "$@" if [[ "$1" == "-ne" || "$1" == "-e" || "$1" == "-n" ]]; then @@ -111,6 +125,7 @@ get_linstor_uuid_from_path() { backup_running_vm() { mount_operation + check_free_space mkdir -p "$dest" || { echo "Failed to create backup directory $dest"; exit 1; } name="root" @@ -160,6 +175,9 @@ backup_running_vm() { virsh -c qemu:///system domiflist $VM > $dest/domiflist.xml 2>/dev/null virsh -c qemu:///system domblklist $VM > $dest/domblklist.xml 2>/dev/null + # Bound the wait so a stuck QEMU backup eventually aborts instead of holding + # the agent's command slot until the orchestrator-level timeout fires. + local elapsed=0 while true; do status=$(virsh -c qemu:///system domjobinfo $VM --completed --keep-completed | awk '/Job type:/ {print $3}') case "$status" in @@ -169,7 +187,13 @@ backup_running_vm() { echo "Virsh backup job failed" cleanup ;; esac + if [[ $elapsed -ge $BACKUP_TIMEOUT ]]; then + echo "Backup timed out after ${BACKUP_TIMEOUT}s for VM $VM" + virsh -c qemu:///system domjobabort $VM > /dev/null 2>&1 || true + exit 1 + fi sleep 5 + elapsed=$((elapsed + 5)) done # Use qemu-img convert to sparsify linstor backups which get bloated due to virsh backup-begin. @@ -204,6 +228,7 @@ backup_running_vm() { backup_stopped_vm() { mount_operation + check_free_space mkdir -p "$dest" || { echo "Failed to create backup directory $dest"; exit 1; } IFS="," @@ -262,12 +287,50 @@ mount_operation() { fi } +check_free_space() { + local free_bytes + free_bytes=$(df -P "$mount_point" 2>/dev/null | awk 'NR==2 {print $4}') + if [[ -n "$free_bytes" ]]; then + # df -P reports 1K blocks; convert to bytes. + free_bytes=$((free_bytes * 1024)) + if [[ $free_bytes -lt $MIN_FREE_SPACE ]]; then + echo "Insufficient free space on backup target: $((free_bytes / 1048576)) MB available, $((MIN_FREE_SPACE / 1048576)) MB required" + exit 1 + fi + log -ne "Backup target has $((free_bytes / 1073741824)) GB free space" + fi +} + cleanup() { + # Idempotent: skip if a prior explicit call already ran. Without this guard, + # the EXIT trap would re-run cleanup and fail on the already-unmounted point. + [[ $CLEANUP_DONE -eq 1 ]] && return 0 + CLEANUP_DONE=1 + local status=0 - rm -rf "$dest" || { echo "Failed to delete $dest"; status=1; } - umount "$mount_point" || { echo "Failed to unmount $mount_point"; status=1; } - rmdir "$mount_point" || { echo "Failed to remove mount point $mount_point"; status=1; } + # If the VM was paused mid-backup (e.g. backup-begin succeeded but the script + # is exiting on error or signal), resume it. Without this a failed backup + # leaves the guest stuck in 'paused' state until an operator intervenes. + if [[ -n "$VM" ]]; then + local vm_state + vm_state=$(virsh -c qemu:///system domstate "$VM" 2>/dev/null || true) + if [[ "$vm_state" == "paused" ]]; then + log -ne "Resuming paused VM $VM during backup cleanup" + if ! virsh -c qemu:///system resume "$VM" > /dev/null 2>&1; then + echo "Failed to resume VM $VM" + status=1 + fi + fi + fi + + if [[ -n "$dest" && -d "$dest" ]]; then + rm -rf "$dest" || { echo "Failed to delete $dest"; status=1; } + fi + if [[ -n "$mount_point" && -d "$mount_point" ]]; then + umount "$mount_point" 2>/dev/null || { echo "Failed to unmount $mount_point"; status=1; } + rmdir "$mount_point" 2>/dev/null || true + fi if [[ $status -ne 0 ]]; then echo "Backup cleanup failed" @@ -275,6 +338,11 @@ cleanup() { fi } +# Trap ensures cleanup runs on any exit path — including SIGTERM/SIGINT and +# unexpected errors caught by set -e — not just the explicit failure branches. +# Prevents orphan NFS mounts from accumulating after non-graceful exits. +trap cleanup EXIT + function usage { echo "" echo "Usage: $0 -o -v|--vm -t -s -m -p -d -q|--quiesce " From 9cbf928c612e65da6869d2ffc2af0198aff5242c Mon Sep 17 00:00:00 2001 From: jmsperu Date: Tue, 26 May 2026 21:22:05 +0300 Subject: [PATCH 2/3] ci: retrigger workflow (flaky/stale shards) From d64286b52b2426ef111bf8a80ecdcbce894707fa Mon Sep 17 00:00:00 2001 From: jmsperu Date: Sun, 21 Jun 2026 20:58:44 +0300 Subject: [PATCH 3/3] fix(nas-backup): fail fast on failure + don't delete successful backups (#12843) Address review feedback on #12843 (verified with a stubbed self-test): - domjobinfo 'Failed' branch: exit after cleanup instead of looping until BACKUP_TIMEOUT (the loop never broke on a failed virsh job). - backup_stopped_vm qemu-img convert failure: exit after cleanup so a partial backup is not treated as success; append to the agent log instead of truncating. - backup_stopped_vm success: unmount before returning, mirroring backup_running_vm/ delete_backup. Without this the 'trap cleanup EXIT' fired on success and ran 'rm -rf $dest', deleting the just-completed backup. - mount_operation: test mount's own exit in an if-condition so 'set -eo pipefail' no longer aborts before the check and a piped tee can't mask a mount failure. - cleanup: only umount when the path is actually a mount, so a non-mount umount error doesn't wrongly report EXIT_CLEANUP_FAILED via the EXIT trap. --- scripts/vm/hypervisor/kvm/nasbackup.sh | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/scripts/vm/hypervisor/kvm/nasbackup.sh b/scripts/vm/hypervisor/kvm/nasbackup.sh index 73533d68a0a7..1f0b98f21e1c 100755 --- a/scripts/vm/hypervisor/kvm/nasbackup.sh +++ b/scripts/vm/hypervisor/kvm/nasbackup.sh @@ -185,7 +185,8 @@ backup_running_vm() { break ;; Failed) echo "Virsh backup job failed" - cleanup ;; + cleanup + exit 1 ;; esac if [[ $elapsed -ge $BACKUP_TIMEOUT ]]; then echo "Backup timed out after ${BACKUP_TIMEOUT}s for VM $VM" @@ -243,15 +244,21 @@ backup_stopped_vm() { volUuid="${disk##*/}" fi output="$dest/$name.$volUuid.qcow2" - if ! qemu-img convert -O qcow2 "$disk" "$output" > "$logFile" 2> >(cat >&2); then + if ! qemu-img convert -O qcow2 "$disk" "$output" >> "$logFile" 2> >(cat >&2); then echo "qemu-img convert failed for $disk $output" cleanup + exit 1 fi name="datadisk" done sync ls -l --numeric-uid-gid $dest | awk '{print $5}' + + # Unmount on success so the EXIT trap's cleanup (which removes an incomplete $dest) + # does not delete the just-completed backup. Mirrors backup_running_vm/delete_backup. + umount $mount_point + rmdir $mount_point } delete_backup() { @@ -278,8 +285,10 @@ mount_operation() { if [ ${NAS_TYPE} == "cifs" ]; then MOUNT_OPTS="${MOUNT_OPTS},nobrl" fi - mount -t ${NAS_TYPE} ${NAS_ADDRESS} ${mount_point} $([[ ! -z "${MOUNT_OPTS}" ]] && echo -o ${MOUNT_OPTS}) 2>&1 | tee -a "$logFile" - if [ $? -eq 0 ]; then + # Test mount's own exit in an if-condition: under `set -eo pipefail` the previous + # `mount ... | tee` form aborted before the check on failure, and `$?` captured tee's + # status (always 0), masking mount failures. Output is appended to the agent log. + if mount -t "${NAS_TYPE}" "${NAS_ADDRESS}" "${mount_point}" $([[ -n "${MOUNT_OPTS}" ]] && echo -o "${MOUNT_OPTS}") >> "$logFile" 2>&1; then log -ne "Successfully mounted ${NAS_TYPE} store" else echo "Failed to mount ${NAS_TYPE} store" @@ -328,7 +337,11 @@ cleanup() { rm -rf "$dest" || { echo "Failed to delete $dest"; status=1; } fi if [[ -n "$mount_point" && -d "$mount_point" ]]; then - umount "$mount_point" 2>/dev/null || { echo "Failed to unmount $mount_point"; status=1; } + # Only umount if it is actually a mount — otherwise (mount failed, or stats/delete + # already unmounted) a umount error would wrongly flag cleanup as failed via the EXIT trap. + if mountpoint -q "$mount_point"; then + umount "$mount_point" 2>/dev/null || { echo "Failed to unmount $mount_point"; status=1; } + fi rmdir "$mount_point" 2>/dev/null || true fi