lavacano · April 16, 2026 03:26
diff --git a/sriov-watchdog.service b/sriov-watchdog.service
 [Unit]
 Description=SR-IOV VF Watchdog for BCM57810 SR-IOV VMs
 # Ordering only: when both units are queued, start after pve-guests.service.
 After=pve-guests.service

 [Service]
 # One run per activation; started by sriov-watchdog.timer.
 Type=oneshot
 ExecStart=/usr/local/bin/sriov-watchdog.sh
 # Private /tmp and /var/tmp for the script (not shared with the host)
 PrivateTmp=true
 # When the unit is stopped or times out, kill every process left in its
 # control group
 KillMode=control-group
 # Hard limit for one run. A full reset stops each VM (qm stop --timeout 120
 # plus up to 30 s of polling per VM), cycles the VFs and restarts the VMs,
 # which can exceed 5 minutes with two VMs. A 300 s limit could kill the
 # script mid-reset and leave the VMs stopped.
 TimeoutStartSec=600
 # Log to journal (stdout/stderr captured automatically)
 StandardOutput=journal
 StandardError=journal
 SyslogIdentifier=sriov-watchdog
diff --git a/sriov-watchdog.sh b/sriov-watchdog.sh
 #!/bin/bash
 # ============================================================================
 # SR-IOV VF Watchdog for Proxmox VE
 # ============================================================================
 # Monitors VMs using SR-IOV VFs from a BCM57810 PF for network degradation.
 # When detected, gracefully stops affected VMs, resets SR-IOV VFs, and
 # restarts them.
 #
 # Root cause: bnx2x VF PCIe link can degrade to Width x0, causing transmit
 # timeouts, RCU stalls, and cascading failures inside the VM.
 #
 # Runs as a systemd oneshot service triggered by sriov-watchdog.timer.
 # No lock file needed — systemd prevents concurrent runs.
 #
 # Install:
 #   cp sriov-watchdog.sh   /usr/local/bin/sriov-watchdog.sh
 #   cp sriov-watchdog.service /etc/systemd/system/
 #   cp sriov-watchdog.timer   /etc/systemd/system/
 #   chmod +x /usr/local/bin/sriov-watchdog.sh
 #   mkdir -p /var/lib/sriov-watchdog
 #   systemctl daemon-reload
 #   systemctl enable --now sriov-watchdog.timer
 #
 # Remove old cron entry:
 #   crontab -e  # delete the sriov-watchdog line
 #
 # View logs:
 #   journalctl -u sriov-watchdog --since "1 hour ago"
 #   systemctl status sriov-watchdog.timer
 # ============================================================================

 # Strict mode: abort on unhandled errors, unset variables and failing
 # pipeline stages.
 set -o errexit -o nounset -o pipefail

 # Make sure the sbin directories are searched first.
 PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${PATH}"
 export PATH

 # ── Configuration ───────────────────────────────────────────────────────────
 # Monitored VMs: VMID -> IP address that is pinged.
 declare -A VM_IPS=(
     [9302]="192.168.40.22"
     [9307]="192.168.40.27"
 )

 # PCI address of the physical function and the VF count written back to
 # its sriov_numvfs attribute after a reset.
 PF_PCI="0000:04:00.0"
 SRIOV_NUMVFS=3

 # Probe parameters and thresholds (counts are per watchdog run, times in
 # seconds).
 PING_COUNT=3
 PING_TIMEOUT=2
 FAIL_THRESHOLD=4
 FAIL_WINDOW_SECONDS=600
 RECOVERY_THRESHOLD=5
 COOLDOWN_SECONDS=600

 # Directory holding the per-VM counters and the cooldown stamp.
 STATE_DIR="/var/lib/sriov-watchdog"
 # ── End Configuration ───────────────────────────────────────────────────────

 # Logging helpers. Each prints one line to stdout in the form
 # "[YYYY-MM-DD HH:MM:SS] <message>"; all arguments are joined with spaces.
 log() {
     printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
 }

 # Same as log, with a "WARNING: " prefix on the message.
 log_warn() {
     log "WARNING: $*"
 }

 # Same as log, with an "ERROR: " prefix on the message.
 log_err() {
     log "ERROR: $*"
 }

 mkdir -p "$STATE_DIR"

 # ── Cooldown check ──────────────────────────────────────────────────────────
 # $COOLDOWN_FILE holds the epoch second of the last reset. While fewer than
 # COOLDOWN_SECONDS have passed since then, this run exits without probing.
 COOLDOWN_FILE="$STATE_DIR/last_reset"
 if [[ -f "$COOLDOWN_FILE" ]]; then
     last_reset=""
     # read fails on an empty file or a missing final newline; any value it
     # did read is still stored, so the failure itself is not an error.
     read -r last_reset < "$COOLDOWN_FILE" || true
     if [[ "$last_reset" =~ ^[0-9]+$ ]]; then
         now=$(date +%s)
         # 10# forces base 10 so a leading zero is not parsed as octal.
         elapsed=$(( now - 10#$last_reset ))
         if (( elapsed < COOLDOWN_SECONDS )); then
             remaining=$(( COOLDOWN_SECONDS - elapsed ))
             log "In cooldown period (${remaining}s remaining). Skipping."
             exit 0
         fi
     else
         # A corrupt stamp fed into arithmetic would abort every run under
         # set -eu until the file is deleted by hand; drop it instead.
         log_warn "Ignoring invalid cooldown timestamp in $COOLDOWN_FILE."
         rm -f "$COOLDOWN_FILE"
     fi
 fi

 # ── Check if VMs are running ───────────────────────────────────────────────
 # Fill vm_running[<vmid>] with true/false. If no monitored VM is running,
 # drop all per-VM counters and end this run.
 declare -A vm_running
 any_running=false
 for vmid in "${!VM_IPS[@]}"; do
     # The second whitespace-separated field of `qm status` is taken as the
     # state; a failing qm (pipefail) is recorded as "unknown".
     if ! status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}'); then
         status="unknown"
     fi
     case "$status" in
         running)
             vm_running[$vmid]=true
             any_running=true
             ;;
         *)
             vm_running[$vmid]=false
             ;;
     esac
 done

 if [[ "$any_running" != "true" ]]; then
     log "No monitored VMs are running. Nothing to check."
     for vmid in "${!VM_IPS[@]}"; do
         rm -f "$STATE_DIR/fails_${vmid}" "$STATE_DIR/recovery_${vmid}"
     done
     exit 0
 fi

 # ── Ping check (time-windowed) ──────────────────────────────────────────────
 # For every running VM, probe its IP and maintain two files in $STATE_DIR:
 #   fails_<vmid>     one epoch timestamp per failed run, pruned to the last
 #                    FAIL_WINDOW_SECONDS
 #   recovery_<vmid>  number of consecutive successful runs
 # needs_reset becomes true when a VM has FAIL_THRESHOLD failures in the
 # window and the kernel log matches the VF error pattern, or
 # FAIL_THRESHOLD + 2 failures regardless of the kernel log.

 # Print how many timestamps in file $1 are less than $3 seconds older than
 # $2. Prints 0 for a missing file; non-numeric lines are skipped.
 _count_recent_failures() {
     local file=$1 now_ts=$2 window=$3 ts count=0
     if [[ ! -f "$file" ]]; then
         printf '0\n'
         return 0
     fi
     while read -r ts || [[ -n "$ts" ]]; do
         [[ "$ts" =~ ^[0-9]+$ ]] || continue
         # Plain assignment instead of (( count++ )): the latter returns 1
         # when count is 0, which terminates the script under set -e.
         if (( now_ts - 10#$ts < window )); then
             count=$(( count + 1 ))
         fi
     done < "$file"
     printf '%s\n' "$count"
 }

 # Rewrite file $1 so it keeps only timestamps less than $3 seconds older
 # than $2.
 _prune_fail_file() {
     local file=$1 now_ts=$2 window=$3 ts tmp
     # The temp file is created next to the target so the final mv is a
     # rename on the same filesystem (the unit runs with a private /tmp).
     tmp=$(mktemp "${file}.XXXXXX")
     while read -r ts || [[ -n "$ts" ]]; do
         [[ "$ts" =~ ^[0-9]+$ ]] || continue
         if (( now_ts - 10#$ts < window )); then
             printf '%s\n' "$ts"
         fi
     done < "$file" > "$tmp"
     mv -f "$tmp" "$file"
 }

 # Succeed when the last 500 kernel log lines match the VF error pattern.
 # NOTE(review): the pattern hard-codes "04:01"; presumably that is where the
 # VFs of PF_PCI appear - confirm if PF_PCI is ever changed.
 _vf_errors_in_dmesg() {
     local recent_kmsg
     # Capture first, grep afterwards: with pipefail, `... | grep -q` can
     # report failure on a match because grep exits early and the upstream
     # writer is killed by SIGPIPE.
     recent_kmsg=$(dmesg --time-format iso 2>/dev/null | tail -500) || recent_kmsg=""
     grep -qiE "bnx2x.*04:01|vfio-pci.*04:01.*reset" <<< "$recent_kmsg"
 }

 needs_reset=false
 now=$(date +%s)

 for vmid in "${!VM_IPS[@]}"; do
     ip="${VM_IPS[$vmid]}"
     fail_file="$STATE_DIR/fails_${vmid}"
     recovery_file="$STATE_DIR/recovery_${vmid}"

     # VMs that are not running are not probed and carry no history.
     if [[ "${vm_running[$vmid]}" != "true" ]]; then
         rm -f "$fail_file" "$recovery_file"
         continue
     fi

     if ping -c "$PING_COUNT" -W "$PING_TIMEOUT" -q "$ip" &>/dev/null; then
         # Success: bump the consecutive-success counter.
         recovery_count=0
         if [[ -f "$recovery_file" ]]; then
             read -r recovery_count < "$recovery_file" || true
             [[ "$recovery_count" =~ ^[0-9]+$ ]] || recovery_count=0
         fi
         recovery_count=$(( 10#$recovery_count + 1 ))
         echo "$recovery_count" > "$recovery_file"

         if (( recovery_count >= RECOVERY_THRESHOLD )); then
             if [[ -f "$fail_file" ]]; then
                 log "VM $vmid ($ip) recovered ($recovery_count consecutive successes). Clearing failure history."
                 rm -f "$fail_file"
             fi
             rm -f "$recovery_file"
         else
             recent=$(_count_recent_failures "$fail_file" "$now" "$FAIL_WINDOW_SECONDS")
             if (( recent > 0 )); then
                 log "VM $vmid ($ip) pingable but only $recovery_count/$RECOVERY_THRESHOLD consecutive successes ($recent recent failures still in window)."
             fi
         fi
     else
         # Failure: record it, reset the success streak, prune old entries.
         echo "$now" >> "$fail_file"
         rm -f "$recovery_file"
         _prune_fail_file "$fail_file" "$now" "$FAIL_WINDOW_SECONDS"

         recent_fails=$(wc -l < "$fail_file")
         log_warn "VM $vmid ($ip) unreachable. Failures in last $((FAIL_WINDOW_SECONDS/60))min: $recent_fails/$FAIL_THRESHOLD"

         if (( recent_fails >= FAIL_THRESHOLD )); then
             if _vf_errors_in_dmesg; then
                 log_err "VM $vmid ($ip) hit $recent_fails failures in window AND bnx2x/vfio errors detected in dmesg."
                 needs_reset=true
             elif (( recent_fails >= FAIL_THRESHOLD + 2 )); then
                 log_err "VM $vmid ($ip) hit $recent_fails failures in window (no bnx2x errors, but triggering reset anyway)."
                 needs_reset=true
             else
                 log_warn "VM $vmid ($ip) unreachable but no bnx2x errors in dmesg. Waiting for more failures."
             fi
         fi
     fi
 done

 # No reset requested in this run: print a heartbeat line naming the VMs
 # that were running, then finish.
 if [[ "$needs_reset" != "true" ]]; then
     running_list=""
     for vmid in "${!VM_IPS[@]}"; do
         if [[ "${vm_running[$vmid]}" == "true" ]]; then
             running_list+="${running_list:+ }${vmid}"
         fi
     done
     log "All clear. Monitored VMs healthy: ${running_list}"
     exit 0
 fi

 # ── SR-IOV Reset Procedure ─────────────────────────────────────────────────
 # Step 1: stop every VM that was running at the start of this run.
 log "============================================================"
 log "SR-IOV RESET TRIGGERED"
 log "============================================================"

 for vmid in "${!VM_IPS[@]}"; do
     [[ "${vm_running[$vmid]}" == "true" ]] || continue

     log "Stopping VM $vmid..."
     # A failing qm stop must not end the script (pipefail + errexit would):
     # that is exactly the case the forced stop below exists for.
     if ! qm stop "$vmid" --timeout 120 2>&1 | while read -r line; do log "  qm stop $vmid: $line"; done; then
         log_warn "qm stop $vmid exited non-zero; polling VM state anyway."
     fi

     # Poll once per second, up to 30 times, for the "stopped" state.
     stopped=false
     for (( i = 1; i <= 30; i++ )); do
         status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}') || status="unknown"
         if [[ "$status" == "stopped" ]]; then
             stopped=true
             break
         fi
         sleep 1
     done

     if [[ "$stopped" == "true" ]]; then
         log "VM $vmid stopped successfully."
     else
         log_err "VM $vmid did not stop within 30s after qm stop. Forcing shutdown."
         # Best effort: the reset continues even if this fails.
         qm stop "$vmid" --skiplock 2>&1 || true
         sleep 5
     fi
 done

 # Step 2: cycle the VFs by writing 0 and then SRIOV_NUMVFS to the PF's
 # sriov_numvfs attribute. Nothing in this step may abort the script: the
 # VMs are stopped at this point and still have to be started again.
 log "Waiting 5s for vfio-pci to release VF devices..."
 sleep 5

 SRIOV_FILE="/sys/bus/pci/devices/${PF_PCI}/sriov_numvfs"
 if [[ ! -f "$SRIOV_FILE" ]]; then
     log_err "SR-IOV numvfs file not found: $SRIOV_FILE"
     log_err "Aborting reset. Starting VMs back up with potentially bad VFs."
 else
     current_vfs=$(cat "$SRIOV_FILE") || current_vfs=""
     log "Current SR-IOV VFs: $current_vfs"

     log "Disabling SR-IOV VFs (setting to 0)..."
     # No 2>&1 here: it would send echo's own error text into the sysfs
     # attribute instead of the journal.
     if ! echo 0 > "$SRIOV_FILE"; then
         log_err "Failed to set sriov_numvfs to 0"
     fi
     sleep 3

     verify=$(cat "$SRIOV_FILE") || verify=""
     log "SR-IOV VFs after disable: $verify"

     log "Re-enabling SR-IOV VFs (setting to $SRIOV_NUMVFS)..."
     if ! echo "$SRIOV_NUMVFS" > "$SRIOV_FILE"; then
         log_err "Failed to set sriov_numvfs to $SRIOV_NUMVFS"
     fi
     sleep 5

     verify=$(cat "$SRIOV_FILE") || verify=""
     log "SR-IOV VFs after re-enable: $verify"

     # String comparison, so an empty or unexpected read-back is reported
     # as a mismatch rather than evaluated as arithmetic.
     if [[ "$verify" != "$SRIOV_NUMVFS" ]]; then
         log_err "SR-IOV VF count mismatch! Expected $SRIOV_NUMVFS, got $verify"
     else
         log "SR-IOV VFs reset successfully."
     fi
 fi

 # Step 3: start every monitored VM, then arm the cooldown and clear the
 # per-VM counters.
 start_failures=0
 for vmid in "${!VM_IPS[@]}"; do
     log "Starting VM $vmid..."
     # A failing qm start must not end the script (pipefail + errexit
     # would): the remaining VMs still need starting and the cooldown stamp
     # still has to be written.
     if ! qm start "$vmid" 2>&1 | while read -r line; do log "  qm start $vmid: $line"; done; then
         log_err "qm start $vmid failed; continuing with remaining VMs."
         start_failures=$(( start_failures + 1 ))
     fi
     sleep 10
 done

 date +%s > "$COOLDOWN_FILE"
 for vmid in "${!VM_IPS[@]}"; do
     rm -f "$STATE_DIR/fails_${vmid}" "$STATE_DIR/recovery_${vmid}"
 done

 if (( start_failures > 0 )); then
     log_err "SR-IOV reset finished, but $start_failures VM start(s) failed. Cooldown active for ${COOLDOWN_SECONDS}s."
 else
     log "SR-IOV reset complete. VMs restarted. Cooldown active for ${COOLDOWN_SECONDS}s."
 fi
 log "============================================================"
diff --git a/sriov-watchdog.timer b/sriov-watchdog.timer
 [Unit]
 Description=SR-IOV VF Watchdog Timer
 After=pve-guests.service

 [Timer]
 # First run two minutes after boot, then two minutes after each
 # activation of the service.
 OnBootSec=2min
 OnUnitActiveSec=2min
 # Up to five seconds of random delay on every elapse.
 RandomizedDelaySec=5s
 # No catch-up for runs missed while the machine was off. systemd documents
 # Persistent= as affecting OnCalendar= timers only, so with the monotonic
 # triggers above this line just states the intent.
 Persistent=false

 [Install]
 WantedBy=timers.target
	[Unit]
	Description=SR-IOV VF Watchdog for BCM57810 SR-IOV VMs
	# Ordering only: when both units are queued, start after pve-guests.service.
	After=pve-guests.service

	[Service]
	# One run per activation; started by sriov-watchdog.timer.
	Type=oneshot
	ExecStart=/usr/local/bin/sriov-watchdog.sh
	# Private /tmp and /var/tmp for the script (not shared with the host)
	PrivateTmp=true
	# When the unit is stopped or times out, kill every process left in its
	# control group
	KillMode=control-group
	# Hard limit for one run. A full reset stops each VM (qm stop --timeout 120
	# plus up to 30 s of polling per VM), cycles the VFs and restarts the VMs,
	# which can exceed 5 minutes with two VMs. A 300 s limit could kill the
	# script mid-reset and leave the VMs stopped.
	TimeoutStartSec=600
	# Log to journal (stdout/stderr captured automatically)
	StandardOutput=journal
	StandardError=journal
	SyslogIdentifier=sriov-watchdog
	#!/bin/bash
	# ============================================================================
	# SR-IOV VF Watchdog for Proxmox VE
	# ============================================================================
	# Monitors VMs using SR-IOV VFs from a BCM57810 PF for network degradation.
	# When detected, gracefully stops affected VMs, resets SR-IOV VFs, and
	# restarts them.
	#
	# Root cause: bnx2x VF PCIe link can degrade to Width x0, causing transmit
	# timeouts, RCU stalls, and cascading failures inside the VM.
	#
	# Runs as a systemd oneshot service triggered by sriov-watchdog.timer.
	# No lock file needed — systemd prevents concurrent runs.
	#
	# Install:
	# cp sriov-watchdog.sh /usr/local/bin/sriov-watchdog.sh
	# cp sriov-watchdog.service /etc/systemd/system/
	# cp sriov-watchdog.timer /etc/systemd/system/
	# chmod +x /usr/local/bin/sriov-watchdog.sh
	# mkdir -p /var/lib/sriov-watchdog
	# systemctl daemon-reload
	# systemctl enable --now sriov-watchdog.timer
	#
	# Remove old cron entry:
	# crontab -e # delete the sriov-watchdog line
	#
	# View logs:
	# journalctl -u sriov-watchdog --since "1 hour ago"
	# systemctl status sriov-watchdog.timer
	# ============================================================================

	# Strict mode: abort on unhandled errors, unset variables and failing
	# pipeline stages.
	set -o errexit -o nounset -o pipefail

	# Make sure the sbin directories are searched first.
	PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${PATH}"
	export PATH

	# ── Configuration ───────────────────────────────────────────────────────────
	# Monitored VMs: VMID -> IP address that is pinged.
	declare -A VM_IPS=(
	    [9302]="192.168.40.22"
	    [9307]="192.168.40.27"
	)

	# PCI address of the physical function and the VF count written back to
	# its sriov_numvfs attribute after a reset.
	PF_PCI="0000:04:00.0"
	SRIOV_NUMVFS=3

	# Probe parameters and thresholds (counts are per watchdog run, times in
	# seconds).
	PING_COUNT=3
	PING_TIMEOUT=2
	FAIL_THRESHOLD=4
	FAIL_WINDOW_SECONDS=600
	RECOVERY_THRESHOLD=5
	COOLDOWN_SECONDS=600

	# Directory holding the per-VM counters and the cooldown stamp.
	STATE_DIR="/var/lib/sriov-watchdog"
	# ── End Configuration ───────────────────────────────────────────────────────

	# Logging helpers. Each prints one line to stdout in the form
	# "[YYYY-MM-DD HH:MM:SS] <message>"; all arguments are joined with spaces.
	log() {
	    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
	}

	# Same as log, with a "WARNING: " prefix on the message.
	log_warn() {
	    log "WARNING: $*"
	}

	# Same as log, with an "ERROR: " prefix on the message.
	log_err() {
	    log "ERROR: $*"
	}

	mkdir -p "$STATE_DIR"

	# ── Cooldown check ──────────────────────────────────────────────────────────
	# $COOLDOWN_FILE holds the epoch second of the last reset. While fewer than
	# COOLDOWN_SECONDS have passed since then, this run exits without probing.
	COOLDOWN_FILE="$STATE_DIR/last_reset"
	if [[ -f "$COOLDOWN_FILE" ]]; then
	    last_reset=""
	    # read fails on an empty file or a missing final newline; any value it
	    # did read is still stored, so the failure itself is not an error.
	    read -r last_reset < "$COOLDOWN_FILE" || true
	    if [[ "$last_reset" =~ ^[0-9]+$ ]]; then
	        now=$(date +%s)
	        # 10# forces base 10 so a leading zero is not parsed as octal.
	        elapsed=$(( now - 10#$last_reset ))
	        if (( elapsed < COOLDOWN_SECONDS )); then
	            remaining=$(( COOLDOWN_SECONDS - elapsed ))
	            log "In cooldown period (${remaining}s remaining). Skipping."
	            exit 0
	        fi
	    else
	        # A corrupt stamp fed into arithmetic would abort every run under
	        # set -eu until the file is deleted by hand; drop it instead.
	        log_warn "Ignoring invalid cooldown timestamp in $COOLDOWN_FILE."
	        rm -f "$COOLDOWN_FILE"
	    fi
	fi

	# ── Check if VMs are running ───────────────────────────────────────────────
	# Fill vm_running[<vmid>] with true/false. If no monitored VM is running,
	# drop all per-VM counters and end this run.
	declare -A vm_running
	any_running=false
	for vmid in "${!VM_IPS[@]}"; do
	    # Real pipe and || operators here: backslash-escaped ones would be
	    # passed to qm as literal arguments. The second whitespace-separated
	    # field of `qm status` is taken as the state; a failing qm (pipefail)
	    # is recorded as "unknown".
	    if ! status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}'); then
	        status="unknown"
	    fi
	    if [[ "$status" == "running" ]]; then
	        vm_running[$vmid]=true
	        any_running=true
	    else
	        vm_running[$vmid]=false
	    fi
	done

	if [[ "$any_running" == "false" ]]; then
	    log "No monitored VMs are running. Nothing to check."
	    for vmid in "${!VM_IPS[@]}"; do
	        rm -f "$STATE_DIR/fails_${vmid}" "$STATE_DIR/recovery_${vmid}"
	    done
	    exit 0
	fi

	# ── Ping check (time-windowed) ──────────────────────────────────────────────
	# For every running VM, probe its IP and maintain two files in $STATE_DIR:
	#   fails_<vmid>     one epoch timestamp per failed run, pruned to the last
	#                    FAIL_WINDOW_SECONDS
	#   recovery_<vmid>  number of consecutive successful runs
	# needs_reset becomes true when a VM has FAIL_THRESHOLD failures in the
	# window and the kernel log matches the VF error pattern, or
	# FAIL_THRESHOLD + 2 failures regardless of the kernel log.

	# Print how many timestamps in file $1 are less than $3 seconds older than
	# $2. Prints 0 for a missing file; non-numeric lines are skipped.
	_count_recent_failures() {
	    local file=$1 now_ts=$2 window=$3 ts count=0
	    if [[ ! -f "$file" ]]; then
	        printf '0\n'
	        return 0
	    fi
	    while read -r ts || [[ -n "$ts" ]]; do
	        [[ "$ts" =~ ^[0-9]+$ ]] || continue
	        # Plain assignment instead of (( count++ )): the latter returns 1
	        # when count is 0, which terminates the script under set -e.
	        if (( now_ts - 10#$ts < window )); then
	            count=$(( count + 1 ))
	        fi
	    done < "$file"
	    printf '%s\n' "$count"
	}

	# Rewrite file $1 so it keeps only timestamps less than $3 seconds older
	# than $2.
	_prune_fail_file() {
	    local file=$1 now_ts=$2 window=$3 ts tmp
	    # The temp file is created next to the target so the final mv is a
	    # rename on the same filesystem (the unit runs with a private /tmp).
	    tmp=$(mktemp "${file}.XXXXXX")
	    while read -r ts || [[ -n "$ts" ]]; do
	        [[ "$ts" =~ ^[0-9]+$ ]] || continue
	        if (( now_ts - 10#$ts < window )); then
	            printf '%s\n' "$ts"
	        fi
	    done < "$file" > "$tmp"
	    mv -f "$tmp" "$file"
	}

	# Succeed when the last 500 kernel log lines match the VF error pattern.
	# NOTE(review): the pattern hard-codes "04:01"; presumably that is where the
	# VFs of PF_PCI appear - confirm if PF_PCI is ever changed.
	_vf_errors_in_dmesg() {
	    local recent_kmsg
	    # Capture first, grep afterwards: with pipefail, `... | grep -q` can
	    # report failure on a match because grep exits early and the upstream
	    # writer is killed by SIGPIPE.
	    recent_kmsg=$(dmesg --time-format iso 2>/dev/null | tail -500) || recent_kmsg=""
	    # The ".*" wildcards are required: a bare "." would only match when
	    # exactly one character separates the driver name from the address.
	    grep -qiE "bnx2x.*04:01|vfio-pci.*04:01.*reset" <<< "$recent_kmsg"
	}

	needs_reset=false
	now=$(date +%s)

	for vmid in "${!VM_IPS[@]}"; do
	    ip="${VM_IPS[$vmid]}"
	    fail_file="$STATE_DIR/fails_${vmid}"
	    recovery_file="$STATE_DIR/recovery_${vmid}"

	    # VMs that are not running are not probed and carry no history.
	    if [[ "${vm_running[$vmid]}" != "true" ]]; then
	        rm -f "$fail_file" "$recovery_file"
	        continue
	    fi

	    if ping -c "$PING_COUNT" -W "$PING_TIMEOUT" -q "$ip" &>/dev/null; then
	        # Success: bump the consecutive-success counter.
	        recovery_count=0
	        if [[ -f "$recovery_file" ]]; then
	            read -r recovery_count < "$recovery_file" || true
	            [[ "$recovery_count" =~ ^[0-9]+$ ]] || recovery_count=0
	        fi
	        recovery_count=$(( 10#$recovery_count + 1 ))
	        echo "$recovery_count" > "$recovery_file"

	        if (( recovery_count >= RECOVERY_THRESHOLD )); then
	            if [[ -f "$fail_file" ]]; then
	                log "VM $vmid ($ip) recovered ($recovery_count consecutive successes). Clearing failure history."
	                rm -f "$fail_file"
	            fi
	            rm -f "$recovery_file"
	        else
	            recent=$(_count_recent_failures "$fail_file" "$now" "$FAIL_WINDOW_SECONDS")
	            if (( recent > 0 )); then
	                log "VM $vmid ($ip) pingable but only $recovery_count/$RECOVERY_THRESHOLD consecutive successes ($recent recent failures still in window)."
	            fi
	        fi
	    else
	        # Failure: record it, reset the success streak, prune old entries.
	        echo "$now" >> "$fail_file"
	        rm -f "$recovery_file"
	        _prune_fail_file "$fail_file" "$now" "$FAIL_WINDOW_SECONDS"

	        recent_fails=$(wc -l < "$fail_file")
	        log_warn "VM $vmid ($ip) unreachable. Failures in last $((FAIL_WINDOW_SECONDS/60))min: $recent_fails/$FAIL_THRESHOLD"

	        if (( recent_fails >= FAIL_THRESHOLD )); then
	            if _vf_errors_in_dmesg; then
	                log_err "VM $vmid ($ip) hit $recent_fails failures in window AND bnx2x/vfio errors detected in dmesg."
	                needs_reset=true
	            elif (( recent_fails >= FAIL_THRESHOLD + 2 )); then
	                log_err "VM $vmid ($ip) hit $recent_fails failures in window (no bnx2x errors, but triggering reset anyway)."
	                needs_reset=true
	            else
	                log_warn "VM $vmid ($ip) unreachable but no bnx2x errors in dmesg. Waiting for more failures."
	            fi
	        fi
	    fi
	done

	# No reset requested in this run: print a heartbeat line naming the VMs
	# that were running, then finish.
	if [[ "$needs_reset" != "true" ]]; then
	    running_list=""
	    for vmid in "${!VM_IPS[@]}"; do
	        if [[ "${vm_running[$vmid]}" == "true" ]]; then
	            running_list+="${running_list:+ }${vmid}"
	        fi
	    done
	    log "All clear. Monitored VMs healthy: ${running_list}"
	    exit 0
	fi

	# ── SR-IOV Reset Procedure ─────────────────────────────────────────────────
	# Step 1: stop every VM that was running at the start of this run.
	log "============================================================"
	log "SR-IOV RESET TRIGGERED"
	log "============================================================"

	for vmid in "${!VM_IPS[@]}"; do
	    [[ "${vm_running[$vmid]}" == "true" ]] || continue

	    log "Stopping VM $vmid..."
	    # A failing qm stop must not end the script (pipefail + errexit would):
	    # that is exactly the case the forced stop below exists for.
	    if ! qm stop "$vmid" --timeout 120 2>&1 | while read -r line; do log " qm stop $vmid: $line"; done; then
	        log_warn "qm stop $vmid exited non-zero; polling VM state anyway."
	    fi

	    # Poll once per second, up to 30 times, for the "stopped" state.
	    stopped=false
	    for (( i = 1; i <= 30; i++ )); do
	        status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}') || status="unknown"
	        if [[ "$status" == "stopped" ]]; then
	            stopped=true
	            break
	        fi
	        sleep 1
	    done

	    if [[ "$stopped" == "true" ]]; then
	        log "VM $vmid stopped successfully."
	    else
	        log_err "VM $vmid did not stop within 30s after qm stop. Forcing shutdown."
	        # Best effort: the reset continues even if this fails.
	        qm stop "$vmid" --skiplock 2>&1 || true
	        sleep 5
	    fi
	done

	# Step 2: cycle the VFs by writing 0 and then SRIOV_NUMVFS to the PF's
	# sriov_numvfs attribute. Nothing in this step may abort the script: the
	# VMs are stopped at this point and still have to be started again.
	log "Waiting 5s for vfio-pci to release VF devices..."
	sleep 5

	SRIOV_FILE="/sys/bus/pci/devices/${PF_PCI}/sriov_numvfs"
	if [[ ! -f "$SRIOV_FILE" ]]; then
	    log_err "SR-IOV numvfs file not found: $SRIOV_FILE"
	    log_err "Aborting reset. Starting VMs back up with potentially bad VFs."
	else
	    current_vfs=$(cat "$SRIOV_FILE") || current_vfs=""
	    log "Current SR-IOV VFs: $current_vfs"

	    log "Disabling SR-IOV VFs (setting to 0)..."
	    # No 2>&1 here: it would send echo's own error text into the sysfs
	    # attribute instead of the journal.
	    if ! echo 0 > "$SRIOV_FILE"; then
	        log_err "Failed to set sriov_numvfs to 0"
	    fi
	    sleep 3

	    verify=$(cat "$SRIOV_FILE") || verify=""
	    log "SR-IOV VFs after disable: $verify"

	    log "Re-enabling SR-IOV VFs (setting to $SRIOV_NUMVFS)..."
	    if ! echo "$SRIOV_NUMVFS" > "$SRIOV_FILE"; then
	        log_err "Failed to set sriov_numvfs to $SRIOV_NUMVFS"
	    fi
	    sleep 5

	    verify=$(cat "$SRIOV_FILE") || verify=""
	    log "SR-IOV VFs after re-enable: $verify"

	    # String comparison, so an empty or unexpected read-back is reported
	    # as a mismatch rather than evaluated as arithmetic.
	    if [[ "$verify" != "$SRIOV_NUMVFS" ]]; then
	        log_err "SR-IOV VF count mismatch! Expected $SRIOV_NUMVFS, got $verify"
	    else
	        log "SR-IOV VFs reset successfully."
	    fi
	fi

	# Step 3: start every monitored VM, then arm the cooldown and clear the
	# per-VM counters.
	start_failures=0
	for vmid in "${!VM_IPS[@]}"; do
	    log "Starting VM $vmid..."
	    # A failing qm start must not end the script (pipefail + errexit
	    # would): the remaining VMs still need starting and the cooldown stamp
	    # still has to be written.
	    if ! qm start "$vmid" 2>&1 | while read -r line; do log " qm start $vmid: $line"; done; then
	        log_err "qm start $vmid failed; continuing with remaining VMs."
	        start_failures=$(( start_failures + 1 ))
	    fi
	    sleep 10
	done

	date +%s > "$COOLDOWN_FILE"
	for vmid in "${!VM_IPS[@]}"; do
	    rm -f "$STATE_DIR/fails_${vmid}" "$STATE_DIR/recovery_${vmid}"
	done

	if (( start_failures > 0 )); then
	    log_err "SR-IOV reset finished, but $start_failures VM start(s) failed. Cooldown active for ${COOLDOWN_SECONDS}s."
	else
	    log "SR-IOV reset complete. VMs restarted. Cooldown active for ${COOLDOWN_SECONDS}s."
	fi
	log "============================================================"
	[Unit]
	Description=SR-IOV VF Watchdog Timer
	After=pve-guests.service

	[Timer]
	# First run two minutes after boot, then two minutes after each
	# activation of the service.
	OnBootSec=2min
	OnUnitActiveSec=2min
	# Up to five seconds of random delay on every elapse.
	RandomizedDelaySec=5s
	# No catch-up for runs missed while the machine was off. systemd documents
	# Persistent= as affecting OnCalendar= timers only, so with the monotonic
	# triggers above this line just states the intent.
	Persistent=false

	[Install]
	WantedBy=timers.target