From 0deb4c4803ca78cfab284b5ba803b905f929d5eb Mon Sep 17 00:00:00 2001 From: jmsperu Date: Sat, 23 May 2026 00:56:30 +0300 Subject: [PATCH] fix(nas-backup): timeout, free-space check, trap-based cleanup with VM resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three independent reliability fixes for the KVM NAS backup script, layered on top of the existing quiesce + EXIT_CLEANUP_FAILED groundwork: 1. BACKUP_TIMEOUT env var (default 6h) bounds the libvirt domjobinfo wait loop in backup_running_vm. Today a stuck QEMU backup holds the agent's command slot until the orchestrator-level timeout fires. The new guard issues domjobabort and exits non-zero so the agent reclaims the slot promptly. 2. MIN_FREE_SPACE env var (default 1 GiB) + check_free_space() runs after mount and before any qemu-img convert in both backup_running_vm and backup_stopped_vm. Fail-fast on a near-full NAS instead of failing mid-write halfway through a multi-GiB convert. 3. trap cleanup EXIT replaces the six explicit cleanup() call sites as the primary cleanup mechanism so orphan NFS mounts no longer accumulate when the script dies to SIGTERM, SIGINT, or any uncaught set -e failure between the explicit call sites. cleanup() is now guarded by CLEANUP_DONE so the trap doesn't re-run an already-completed cleanup from an explicit call. cleanup() additionally resumes the VM if it's still paused — backup-begin holds the guest paused briefly and a failed backup mid-pause currently leaves the guest stuck in 'paused' state until an operator intervenes. Targets main; supersedes the 4.20-targeted version of this PR. --- scripts/vm/hypervisor/kvm/nasbackup.sh | 74 ++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/scripts/vm/hypervisor/kvm/nasbackup.sh b/scripts/vm/hypervisor/kvm/nasbackup.sh index 441312f35e86..73533d68a0a7 100755 --- a/scripts/vm/hypervisor/kvm/nasbackup.sh +++ b/scripts/vm/hypervisor/kvm/nasbackup.sh @@ -37,6 +37,20 @@ logFile="/var/log/cloudstack/agent/agent.log" EXIT_CLEANUP_FAILED=20 +# Backup job timeout in seconds (default: 6 hours). Guards the libvirt +# domjobinfo wait loop so a stuck QEMU backup eventually fails the script +# instead of holding the agent's command slot indefinitely. +BACKUP_TIMEOUT=${BACKUP_TIMEOUT:-21600} + +# Minimum free space required on the mounted backup target, in bytes +# (default: 1 GiB). Checked after mount, before any qemu-img convert, so +# we fail fast rather than mid-write when the NAS is near-full. +MIN_FREE_SPACE=${MIN_FREE_SPACE:-1073741824} + +# Guards cleanup() against double-execution when both an explicit call +# and the EXIT trap fire (e.g. error path calls cleanup; exit 1 → trap). +CLEANUP_DONE=0 + log() { [[ "$verb" -eq 1 ]] && builtin echo "$@" if [[ "$1" == "-ne" || "$1" == "-e" || "$1" == "-n" ]]; then @@ -111,6 +125,7 @@ get_linstor_uuid_from_path() { backup_running_vm() { mount_operation + check_free_space mkdir -p "$dest" || { echo "Failed to create backup directory $dest"; exit 1; } name="root" @@ -160,6 +175,9 @@ backup_running_vm() { virsh -c qemu:///system domiflist $VM > $dest/domiflist.xml 2>/dev/null virsh -c qemu:///system domblklist $VM > $dest/domblklist.xml 2>/dev/null + # Bound the wait so a stuck QEMU backup eventually aborts instead of holding + # the agent's command slot until the orchestrator-level timeout fires. + local elapsed=0 while true; do status=$(virsh -c qemu:///system domjobinfo $VM --completed --keep-completed | awk '/Job type:/ {print $3}') case "$status" in @@ -169,7 +187,13 @@ backup_running_vm() { echo "Virsh backup job failed" cleanup ;; esac + if [[ $elapsed -ge $BACKUP_TIMEOUT ]]; then + echo "Backup timed out after ${BACKUP_TIMEOUT}s for VM $VM" + virsh -c qemu:///system domjobabort $VM > /dev/null 2>&1 || true + exit 1 + fi sleep 5 + elapsed=$((elapsed + 5)) done # Use qemu-img convert to sparsify linstor backups which get bloated due to virsh backup-begin. @@ -204,6 +228,7 @@ backup_running_vm() { backup_stopped_vm() { mount_operation + check_free_space mkdir -p "$dest" || { echo "Failed to create backup directory $dest"; exit 1; } IFS="," @@ -262,12 +287,50 @@ mount_operation() { fi } +check_free_space() { + local free_bytes + free_bytes=$(df -P "$mount_point" 2>/dev/null | awk 'NR==2 {print $4}') + if [[ -n "$free_bytes" ]]; then + # df -P reports 1K blocks; convert to bytes. + free_bytes=$((free_bytes * 1024)) + if [[ $free_bytes -lt $MIN_FREE_SPACE ]]; then + echo "Insufficient free space on backup target: $((free_bytes / 1048576)) MB available, $((MIN_FREE_SPACE / 1048576)) MB required" + exit 1 + fi + log -ne "Backup target has $((free_bytes / 1073741824)) GB free space" + fi +} + cleanup() { + # Idempotent: skip if a prior explicit call already ran. Without this guard, + # the EXIT trap would re-run cleanup and fail on the already-unmounted point. + [[ $CLEANUP_DONE -eq 1 ]] && return 0 + CLEANUP_DONE=1 + local status=0 - rm -rf "$dest" || { echo "Failed to delete $dest"; status=1; } - umount "$mount_point" || { echo "Failed to unmount $mount_point"; status=1; } - rmdir "$mount_point" || { echo "Failed to remove mount point $mount_point"; status=1; } + # If the VM was paused mid-backup (e.g. backup-begin succeeded but the script + # is exiting on error or signal), resume it. Without this a failed backup + # leaves the guest stuck in 'paused' state until an operator intervenes. + if [[ -n "$VM" ]]; then + local vm_state + vm_state=$(virsh -c qemu:///system domstate "$VM" 2>/dev/null || true) + if [[ "$vm_state" == "paused" ]]; then + log -ne "Resuming paused VM $VM during backup cleanup" + if ! virsh -c qemu:///system resume "$VM" > /dev/null 2>&1; then + echo "Failed to resume VM $VM" + status=1 + fi + fi + fi + + if [[ -n "$dest" && -d "$dest" ]]; then + rm -rf "$dest" || { echo "Failed to delete $dest"; status=1; } + fi + if [[ -n "$mount_point" && -d "$mount_point" ]]; then + umount "$mount_point" 2>/dev/null || { echo "Failed to unmount $mount_point"; status=1; } + rmdir "$mount_point" 2>/dev/null || true + fi if [[ $status -ne 0 ]]; then echo "Backup cleanup failed" @@ -275,6 +338,11 @@ cleanup() { fi } +# Trap ensures cleanup runs on any exit path — including SIGTERM/SIGINT and +# unexpected errors caught by set -e — not just the explicit failure branches. +# Prevents orphan NFS mounts from accumulating after non-graceful exits. +trap cleanup EXIT + function usage { echo "" echo "Usage: $0 -o -v|--vm -t -s -m -p -d -q|--quiesce "