SR-IOV VF Watchdog for Proxmox VE (bnx2x)
#!/bin/bash
# ============================================================================
# SR-IOV VF Watchdog for Proxmox VE
# ============================================================================
# Monitors VMs using SR-IOV VFs from a BCM57810 PF for network degradation.
# When degradation is detected, it gracefully stops the affected VMs,
# resets the SR-IOV VFs, and restarts them.
#
# Root cause: bnx2x VF PCIe link can degrade to Width x0, causing transmit
# timeouts, RCU stalls, and cascading failures inside the VM.
#
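# Spot-checking by hand (a diagnostic sketch; the addresses assume the
# PF at 0000:04:00.0 configured below — adjust for your topology):
#   lspci -s 0000:04:00.0 -vv | grep -i 'LnkSta:'    # look for "Width x0"
#   ls -l /sys/bus/pci/devices/0000:04:00.0/virtfn*  # list the active VFs
#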
# Install:
# cp sriov-watchdog.sh /usr/local/bin/sriov-watchdog.sh
# chmod +x /usr/local/bin/sriov-watchdog.sh
# mkdir -p /var/lib/sriov-watchdog
#
# Cron (every minute):
# * * * * * /usr/local/bin/sriov-watchdog.sh >> /var/log/sriov-watchdog.log 2>&1
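#
# Optional: the cron line appends to the log forever; a logrotate drop-in
# keeps it bounded (a sketch, assuming logrotate is installed; the file
# name is illustrative):
#   # /etc/logrotate.d/sriov-watchdog
#   /var/log/sriov-watchdog.log {
#       weekly
#       rotate 4
#       compress
#       missingok
#       notifempty
#   }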
# ============================================================================
set -euo pipefail
# Ensure full PATH for cron (qm is in /usr/sbin)
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH"
# ── Configuration ───────────────────────────────────────────────────────────
# VMs to monitor (must ALL be stopped before SR-IOV reset)
declare -A VM_IPS
VM_IPS[9302]="192.168.40.22"
VM_IPS[9307]="192.168.40.27" # <-- UPDATE THIS to 9307's actual IP
# SR-IOV Physical Function PCI address
PF_PCI="0000:04:00.0"
SRIOV_NUMVFS=3
# Detection tuning
PING_COUNT=3 # pings per check
PING_TIMEOUT=2 # seconds per ping
FAIL_THRESHOLD=4 # failures within window before action
FAIL_WINDOW_SECONDS=600 # 10 min sliding window for failures
RECOVERY_THRESHOLD=5 # consecutive successes to clear failure history
COOLDOWN_SECONDS=600 # 10 min cooldown after a reset (avoid loops)
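# Worked example with the defaults: cron fires once a minute, so the 600s
# window holds at most ~10 samples; a reset needs 4 failed checks in that
# window (6 if dmesg shows no corroborating bnx2x/vfio errors), and 5
# consecutive clean checks wipe the failure history.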
# Paths
STATE_DIR="/var/lib/sriov-watchdog"
LOCK_FILE="/var/run/sriov-watchdog.lock"
# ── End Configuration ───────────────────────────────────────────────────────
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
}
log_warn() {
    log "WARNING: $*"
}
log_err() {
    log "ERROR: $*"
}
# Ensure state directory exists
mkdir -p "$STATE_DIR"
# ── Lock (prevent concurrent runs) ─────────────────────────────────────────
exec 200>"$LOCK_FILE"
if ! flock -n 200; then
    log "Another instance is running. Exiting."
    exit 0
fi
# ── Cooldown check ──────────────────────────────────────────────────────────
COOLDOWN_FILE="$STATE_DIR/last_reset"
if [[ -f "$COOLDOWN_FILE" ]]; then
last_reset=$(cat "$COOLDOWN_FILE")
now=$(date +%s)
elapsed=$(( now - last_reset ))
if (( elapsed < COOLDOWN_SECONDS )); then
remaining=$(( COOLDOWN_SECONDS - elapsed ))
log "In cooldown period (${remaining}s remaining). Skipping."
exit 0
fi
fi
# ── Check if VMs are running ───────────────────────────────────────────────
any_running=false
declare -A vm_running
for vmid in "${!VM_IPS[@]}"; do
    status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}') || status="unknown"
    if [[ "$status" == "running" ]]; then
        vm_running[$vmid]=true
        any_running=true
    else
        vm_running[$vmid]=false
    fi
done
if [[ "$any_running" == "false" ]]; then
log "No monitored VMs are running. Nothing to check."
# Clear any failure counters since VMs are down
for vmid in "${!VM_IPS[@]}"; do
rm -f "$STATE_DIR/fails_${vmid}" "$STATE_DIR/recovery_${vmid}"
done
exit 0
fi
# ── Ping check (time-windowed) ──────────────────────────────────────────────
needs_reset=false
now=$(date +%s)
for vmid in "${!VM_IPS[@]}"; do
    ip="${VM_IPS[$vmid]}"
    fail_file="$STATE_DIR/fails_${vmid}"        # one failure timestamp per line
    recovery_file="$STATE_DIR/recovery_${vmid}" # consecutive success count
    # Skip VMs that aren't running
    if [[ "${vm_running[$vmid]}" != "true" ]]; then
        rm -f "$fail_file" "$recovery_file"
        continue
    fi
    # Ping the VM
    if ping -c "$PING_COUNT" -W "$PING_TIMEOUT" -q "$ip" &>/dev/null; then
        # VM is reachable — increment the recovery counter
        recovery_count=0
        [[ -f "$recovery_file" ]] && recovery_count=$(cat "$recovery_file")
        recovery_count=$(( recovery_count + 1 ))
        echo "$recovery_count" > "$recovery_file"
        if (( recovery_count >= RECOVERY_THRESHOLD )); then
            # Sustained recovery — clear the failure history
            if [[ -f "$fail_file" ]]; then
                log "VM $vmid ($ip) recovered ($recovery_count consecutive successes). Clearing failure history."
                rm -f "$fail_file"
            fi
            rm -f "$recovery_file"
        else
            # Count recent failures still inside the window
            recent=0
            if [[ -f "$fail_file" ]]; then
                while read -r ts; do
                    # Avoid '(( recent++ ))' here: it returns status 1 when
                    # recent is 0, which would trip 'set -e' and kill the script
                    (( now - ts < FAIL_WINDOW_SECONDS )) && recent=$(( recent + 1 ))
                done < "$fail_file"
            fi
            if (( recent > 0 )); then
                log "VM $vmid ($ip) pingable but only $recovery_count/$RECOVERY_THRESHOLD consecutive successes ($recent recent failures still in window)."
            fi
        fi
    else
        # VM is unreachable — record the failure timestamp, reset the recovery counter
        echo "$now" >> "$fail_file"
        rm -f "$recovery_file"
        # Prune entries that have aged out of the window
        if [[ -f "$fail_file" ]]; then
            tmpfile=$(mktemp)
            while read -r ts; do
                (( now - ts < FAIL_WINDOW_SECONDS )) && echo "$ts"
            done < "$fail_file" > "$tmpfile"
            mv "$tmpfile" "$fail_file"
        fi
        # Count recent failures
        recent_fails=$(wc -l < "$fail_file")
        log_warn "VM $vmid ($ip) unreachable. Failures in last $((FAIL_WINDOW_SECONDS/60))min: $recent_fails/$FAIL_THRESHOLD"
        if (( recent_fails >= FAIL_THRESHOLD )); then
            # Double-check: look for bnx2x errors in host dmesg
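            # The grep pattern assumes the VFs of PF 0000:04:00.0 enumerate
            # at 04:01.x (as on this BCM57810); adjust it if your VF
            # addresses differ.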
            if dmesg --time-format iso 2>/dev/null | tail -n 500 | grep -qiE "bnx2x.*04:01|vfio-pci.*04:01.*reset"; then
                log_err "VM $vmid ($ip) hit $recent_fails failures in window AND bnx2x/vfio errors detected in dmesg."
                needs_reset=true
            elif (( recent_fails >= FAIL_THRESHOLD + 2 )); then
                log_err "VM $vmid ($ip) hit $recent_fails failures in window (no bnx2x errors, but triggering reset anyway)."
                needs_reset=true
            else
                log_warn "VM $vmid ($ip) unreachable but no bnx2x errors in dmesg. Waiting for more failures."
            fi
        fi
    fi
done
if [[ "$needs_reset" != "true" ]]; then
exit 0
fi
# ── SR-IOV Reset Procedure ─────────────────────────────────────────────────
log "============================================================"
log "SR-IOV RESET TRIGGERED"
log "============================================================"
# Step 1: Stop all monitored VMs (must stop ALL before touching SR-IOV)
for vmid in "${!VM_IPS[@]}"; do
    if [[ "${vm_running[$vmid]}" == "true" ]]; then
        log "Stopping VM $vmid..."
        # '|| true' keeps a failed qm stop from aborting the whole script
        # under 'set -euo pipefail'; the wait loop below handles stragglers
        qm stop "$vmid" --timeout 120 2>&1 | while read -r line; do log "  qm stop $vmid: $line"; done || true
        # Wait for the VM to fully stop
        for i in $(seq 1 30); do
            status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}') || status="unknown"
            if [[ "$status" == "stopped" ]]; then
                log "VM $vmid stopped successfully."
                break
            fi
            if (( i == 30 )); then
                log_err "VM $vmid did not stop within 30s after qm stop. Forcing shutdown."
                qm stop "$vmid" --skiplock 2>&1 || true
                sleep 5
            fi
            sleep 1
        done
    fi
done
# Step 2: Wait for VFIO to release devices
log "Waiting 5s for vfio-pci to release VF devices..."
sleep 5
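# (A manual sanity check, if resets ever race QEMU teardown:
# 'fuser -v /dev/vfio/*' shows whether any process still holds a VFIO
# group before sriov_numvfs is touched.)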
# Step 3: Reset SR-IOV VFs
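# (The kernel only lets sriov_numvfs change between 0 and a nonzero value,
# so the VFs must be torn down before they can be re-created.)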
SRIOV_FILE="/sys/bus/pci/devices/${PF_PCI}/sriov_numvfs"
if [[ ! -f "$SRIOV_FILE" ]]; then
    log_err "SR-IOV numvfs file not found: $SRIOV_FILE"
    log_err "Aborting reset. Starting VMs back up with potentially bad VFs."
else
    current_vfs=$(cat "$SRIOV_FILE")
    log "Current SR-IOV VFs: $current_vfs"
    log "Disabling SR-IOV VFs (setting to 0)..."
    echo 0 > "$SRIOV_FILE" 2>&1 || log_err "Failed to set sriov_numvfs to 0"
    sleep 3
    verify=$(cat "$SRIOV_FILE")
    log "SR-IOV VFs after disable: $verify"
    log "Re-enabling SR-IOV VFs (setting to $SRIOV_NUMVFS)..."
    echo "$SRIOV_NUMVFS" > "$SRIOV_FILE" 2>&1 || log_err "Failed to set sriov_numvfs to $SRIOV_NUMVFS"
    sleep 5
    verify=$(cat "$SRIOV_FILE")
    log "SR-IOV VFs after re-enable: $verify"
    if (( verify != SRIOV_NUMVFS )); then
        log_err "SR-IOV VF count mismatch! Expected $SRIOV_NUMVFS, got $verify"
    else
        log "SR-IOV VFs reset successfully."
    fi
fi
# Step 4: Start VMs back up (stagger to avoid thundering herd)
for vmid in "${!VM_IPS[@]}"; do
    log "Starting VM $vmid..."
    # '|| true' again guards against 'set -e' aborting before every VM is started
    qm start "$vmid" 2>&1 | while read -r line; do log "  qm start $vmid: $line"; done || true
    sleep 10  # stagger starts
done
# Step 5: Record reset time and clear failure counters
date +%s > "$COOLDOWN_FILE"
for vmid in "${!VM_IPS[@]}"; do
    rm -f "$STATE_DIR/fails_${vmid}" "$STATE_DIR/recovery_${vmid}"
done
log "SR-IOV reset complete. VMs restarted. Cooldown active for ${COOLDOWN_SECONDS}s."
log "============================================================"