From 0deb4c4803ca78cfab284b5ba803b905f929d5eb Mon Sep 17 00:00:00 2001
From: jmsperu <jmsperu@users.noreply.github.com>
Date: Sat, 23 May 2026 00:56:30 +0300
Subject: [PATCH 1/3] fix(nas-backup): timeout, free-space check, trap-based
 cleanup with VM resume
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three independent reliability fixes for the KVM NAS backup script, layered on
top of the existing quiesce + EXIT_CLEANUP_FAILED groundwork:

1. BACKUP_TIMEOUT env var (default 6h) bounds the libvirt domjobinfo wait
   loop in backup_running_vm. Today a stuck QEMU backup holds the agent's
   command slot until the orchestrator-level timeout fires. The new guard
   issues domjobabort and exits non-zero so the agent reclaims the slot
   promptly.

2. MIN_FREE_SPACE env var (default 1 GiB) + check_free_space() runs after
   mount and before any qemu-img convert in both backup_running_vm and
   backup_stopped_vm. Fail-fast on a near-full NAS instead of failing
   mid-write halfway through a multi-GiB convert.

3. trap cleanup EXIT replaces the six explicit cleanup() call sites as the
   primary cleanup mechanism so orphan NFS mounts no longer accumulate when
   the script dies to SIGTERM, SIGINT, or any uncaught set -e failure
   between the explicit call sites. cleanup() is now guarded by
   CLEANUP_DONE so the trap doesn't re-run an already-completed cleanup
   from an explicit call.

cleanup() additionally resumes the VM if it's still paused — backup-begin
holds the guest paused briefly and a failed backup mid-pause currently
leaves the guest stuck in 'paused' state until an operator intervenes.

Targets main; supersedes the 4.20-targeted version of this PR.
---
 scripts/vm/hypervisor/kvm/nasbackup.sh | 74 ++++++++++++++++++++++++--
 1 file changed, 71 insertions(+), 3 deletions(-)

diff --git a/scripts/vm/hypervisor/kvm/nasbackup.sh b/scripts/vm/hypervisor/kvm/nasbackup.sh
index 441312f35e86..73533d68a0a7 100755
--- a/scripts/vm/hypervisor/kvm/nasbackup.sh
+++ b/scripts/vm/hypervisor/kvm/nasbackup.sh
@@ -37,6 +37,20 @@ logFile="/var/log/cloudstack/agent/agent.log"
 
 EXIT_CLEANUP_FAILED=20
 
+# Backup job timeout in seconds (default: 6 hours). Guards the libvirt
+# domjobinfo wait loop so a stuck QEMU backup eventually fails the script
+# instead of holding the agent's command slot indefinitely.
+BACKUP_TIMEOUT=${BACKUP_TIMEOUT:-21600}
+
+# Minimum free space required on the mounted backup target, in bytes
+# (default: 1 GiB). Checked after mount, before any qemu-img convert, so
+# we fail fast rather than mid-write when the NAS is near-full.
+MIN_FREE_SPACE=${MIN_FREE_SPACE:-1073741824}
+
+# Guards cleanup() against double-execution when both an explicit call
+# and the EXIT trap fire (e.g. error path calls cleanup; exit 1 → trap).
+CLEANUP_DONE=0
+
 log() {
   [[ "$verb" -eq 1 ]] && builtin echo "$@"
   if [[ "$1" == "-ne"  || "$1" == "-e" || "$1" == "-n" ]]; then
@@ -111,6 +125,7 @@ get_linstor_uuid_from_path() {
 
 backup_running_vm() {
   mount_operation
+  check_free_space
   mkdir -p "$dest" || { echo "Failed to create backup directory $dest"; exit 1; }
 
   name="root"
@@ -160,6 +175,9 @@ backup_running_vm() {
   virsh -c qemu:///system domiflist $VM > $dest/domiflist.xml 2>/dev/null
   virsh -c qemu:///system domblklist $VM > $dest/domblklist.xml 2>/dev/null
 
+  # Bound the wait so a stuck QEMU backup eventually aborts instead of holding
+  # the agent's command slot until the orchestrator-level timeout fires.
+  local elapsed=0
   while true; do
     status=$(virsh -c qemu:///system domjobinfo $VM --completed --keep-completed | awk '/Job type:/ {print $3}')
     case "$status" in
@@ -169,7 +187,13 @@ backup_running_vm() {
         echo "Virsh backup job failed"
         cleanup ;;
     esac
+    if [[ $elapsed -ge $BACKUP_TIMEOUT ]]; then
+      echo "Backup timed out after ${BACKUP_TIMEOUT}s for VM $VM"
+      virsh -c qemu:///system domjobabort $VM > /dev/null 2>&1 || true
+      exit 1
+    fi
     sleep 5
+    elapsed=$((elapsed + 5))
   done
 
   # Use qemu-img convert to sparsify linstor backups which get bloated due to virsh backup-begin.
@@ -204,6 +228,7 @@ backup_running_vm() {
 
 backup_stopped_vm() {
   mount_operation
+  check_free_space
   mkdir -p "$dest" || { echo "Failed to create backup directory $dest"; exit 1; }
 
   IFS=","
@@ -262,12 +287,50 @@ mount_operation() {
   fi
 }
 
+check_free_space() {
+  local free_bytes
+  free_bytes=$(df -P "$mount_point" 2>/dev/null | awk 'NR==2 {print $4}')
+  if [[ -n "$free_bytes" ]]; then
+    # df -P reports 1K blocks; convert to bytes.
+    free_bytes=$((free_bytes * 1024))
+    if [[ $free_bytes -lt $MIN_FREE_SPACE ]]; then
+      echo "Insufficient free space on backup target: $((free_bytes / 1048576)) MB available, $((MIN_FREE_SPACE / 1048576)) MB required"
+      exit 1
+    fi
+    log -ne "Backup target has $((free_bytes / 1073741824)) GB free space"
+  fi
+}
+
 cleanup() {
+  # Idempotent: skip if a prior explicit call already ran. Without this guard,
+  # the EXIT trap would re-run cleanup and fail on the already-unmounted point.
+  [[ $CLEANUP_DONE -eq 1 ]] && return 0
+  CLEANUP_DONE=1
+
   local status=0
 
-  rm -rf "$dest" || { echo "Failed to delete $dest"; status=1; }
-  umount "$mount_point" || { echo "Failed to unmount $mount_point"; status=1; }
-  rmdir "$mount_point" || { echo "Failed to remove mount point $mount_point"; status=1; }
+  # If the VM was paused mid-backup (e.g. backup-begin succeeded but the script
+  # is exiting on error or signal), resume it. Without this a failed backup
+  # leaves the guest stuck in 'paused' state until an operator intervenes.
+  if [[ -n "$VM" ]]; then
+    local vm_state
+    vm_state=$(virsh -c qemu:///system domstate "$VM" 2>/dev/null || true)
+    if [[ "$vm_state" == "paused" ]]; then
+      log -ne "Resuming paused VM $VM during backup cleanup"
+      if ! virsh -c qemu:///system resume "$VM" > /dev/null 2>&1; then
+        echo "Failed to resume VM $VM"
+        status=1
+      fi
+    fi
+  fi
+
+  if [[ -n "$dest" && -d "$dest" ]]; then
+    rm -rf "$dest" || { echo "Failed to delete $dest"; status=1; }
+  fi
+  if [[ -n "$mount_point" && -d "$mount_point" ]]; then
+    umount "$mount_point" 2>/dev/null || { echo "Failed to unmount $mount_point"; status=1; }
+    rmdir "$mount_point" 2>/dev/null || true
+  fi
 
   if [[ $status -ne 0 ]]; then
     echo "Backup cleanup failed"
@@ -275,6 +338,11 @@ cleanup() {
   fi
 }
 
+# Trap ensures cleanup runs on any exit path — including SIGTERM/SIGINT and
+# unexpected errors caught by set -e — not just the explicit failure branches.
+# Prevents orphan NFS mounts from accumulating after non-graceful exits.
+trap cleanup EXIT
+
 function usage {
   echo ""
   echo "Usage: $0 -o <operation> -v|--vm <domain name> -t <storage type> -s <storage address> -m <mount options> -p <backup path> -d <disks path> -q|--quiesce <true|false>"

From 9cbf928c612e65da6869d2ffc2af0198aff5242c Mon Sep 17 00:00:00 2001
From: jmsperu <jmsperu@users.noreply.github.com>
Date: Tue, 26 May 2026 21:22:05 +0300
Subject: [PATCH 2/3] ci: retrigger workflow (flaky/stale shards)


From d64286b52b2426ef111bf8a80ecdcbce894707fa Mon Sep 17 00:00:00 2001
From: jmsperu <jmsperu@users.noreply.github.com>
Date: Sun, 21 Jun 2026 20:58:44 +0300
Subject: [PATCH 3/3] fix(nas-backup): fail fast on failure + don't delete
 successful backups (#12843)

Address review feedback on #12843 (verified with a stubbed self-test):
- domjobinfo 'Failed' branch: exit after cleanup instead of looping until
  BACKUP_TIMEOUT (the loop never broke on a failed virsh job).
- backup_stopped_vm qemu-img convert failure: exit after cleanup so a partial
  backup is not treated as success; append to the agent log instead of truncating.
- backup_stopped_vm success: unmount before returning, mirroring backup_running_vm/
  delete_backup. Without this the 'trap cleanup EXIT' fired on success and ran
  'rm -rf $dest', deleting the just-completed backup.
- mount_operation: test mount's own exit in an if-condition so 'set -eo pipefail'
  no longer aborts before the check and a piped tee can't mask a mount failure.
- cleanup: only umount when the path is actually a mount, so a non-mount umount
  error doesn't wrongly report EXIT_CLEANUP_FAILED via the EXIT trap.
---
 scripts/vm/hypervisor/kvm/nasbackup.sh | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/scripts/vm/hypervisor/kvm/nasbackup.sh b/scripts/vm/hypervisor/kvm/nasbackup.sh
index 73533d68a0a7..1f0b98f21e1c 100755
--- a/scripts/vm/hypervisor/kvm/nasbackup.sh
+++ b/scripts/vm/hypervisor/kvm/nasbackup.sh
@@ -185,7 +185,8 @@ backup_running_vm() {
         break ;;
       Failed)
         echo "Virsh backup job failed"
-        cleanup ;;
+        cleanup
+        exit 1 ;;
     esac
     if [[ $elapsed -ge $BACKUP_TIMEOUT ]]; then
       echo "Backup timed out after ${BACKUP_TIMEOUT}s for VM $VM"
@@ -243,15 +244,21 @@ backup_stopped_vm() {
       volUuid="${disk##*/}"
     fi
     output="$dest/$name.$volUuid.qcow2"
-    if ! qemu-img convert -O qcow2 "$disk" "$output" > "$logFile" 2> >(cat >&2); then
+    if ! qemu-img convert -O qcow2 "$disk" "$output" >> "$logFile" 2> >(cat >&2); then
       echo "qemu-img convert failed for $disk $output"
       cleanup
+      exit 1
     fi
     name="datadisk"
   done
   sync
 
   ls -l --numeric-uid-gid $dest | awk '{print $5}'
+
+  # Unmount on success so the EXIT trap's cleanup (which removes an incomplete $dest)
+  # does not delete the just-completed backup. Mirrors backup_running_vm/delete_backup.
+  umount $mount_point
+  rmdir $mount_point
 }
 
 delete_backup() {
@@ -278,8 +285,10 @@ mount_operation() {
   if [ ${NAS_TYPE} == "cifs" ]; then
     MOUNT_OPTS="${MOUNT_OPTS},nobrl"
   fi
-  mount -t ${NAS_TYPE} ${NAS_ADDRESS} ${mount_point} $([[ ! -z "${MOUNT_OPTS}" ]] && echo -o ${MOUNT_OPTS}) 2>&1 | tee -a "$logFile"
-  if [ $? -eq 0 ]; then
+  # Test mount's own exit in an if-condition: under `set -eo pipefail` the previous
+  # `mount ... | tee` form aborted before the check on failure, and `$?` captured tee's
+  # status (always 0), masking mount failures. Output is appended to the agent log.
+  if mount -t "${NAS_TYPE}" "${NAS_ADDRESS}" "${mount_point}" $([[ -n "${MOUNT_OPTS}" ]] && echo -o "${MOUNT_OPTS}") >> "$logFile" 2>&1; then
       log -ne "Successfully mounted ${NAS_TYPE} store"
   else
       echo "Failed to mount ${NAS_TYPE} store"
@@ -328,7 +337,11 @@ cleanup() {
     rm -rf "$dest" || { echo "Failed to delete $dest"; status=1; }
   fi
   if [[ -n "$mount_point" && -d "$mount_point" ]]; then
-    umount "$mount_point" 2>/dev/null || { echo "Failed to unmount $mount_point"; status=1; }
+    # Only umount if it is actually a mount — otherwise (mount failed, or stats/delete
+    # already unmounted) a umount error would wrongly flag cleanup as failed via the EXIT trap.
+    if mountpoint -q "$mount_point"; then
+      umount "$mount_point" 2>/dev/null || { echo "Failed to unmount $mount_point"; status=1; }
+    fi
     rmdir "$mount_point" 2>/dev/null || true
   fi