SR-IOV VF Watchdog for Proxmox VE (bnx2x)
#!/bin/bash
# ============================================================================
# SR-IOV VF Watchdog for Proxmox VE
# ============================================================================
# Monitors VMs using SR-IOV VFs from a BCM57810 PF for network degradation.
# When degradation is detected, it gracefully stops the affected VMs,
# resets the SR-IOV VFs, and restarts them.
#
# Root cause: bnx2x VF PCIe link can degrade to Width x0, causing transmit
# timeouts, RCU stalls, and cascading failures inside the VM.
#
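# Spot-checking by hand (a diagnostic sketch; the addresses assume the
# PF at 0000:04:00.0 configured below — adjust for your topology):
#   lspci -s 0000:04:00.0 -vv | grep -i 'LnkSta:'    # look for "Width x0"
#   ls -l /sys/bus/pci/devices/0000:04:00.0/virtfn*  # list the active VFs
#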
# Install:
# cp sriov-watchdog.sh /usr/local/bin/sriov-watchdog.sh
# chmod +x /usr/local/bin/sriov-watchdog.sh
# mkdir -p /var/lib/sriov-watchdog
#
# Cron (every minute):
# * * * * * /usr/local/bin/sriov-watchdog.sh >> /var/log/sriov-watchdog.log 2>&1
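#
# Optional: the cron line appends to the log forever; a logrotate drop-in
# keeps it bounded (a sketch, assuming logrotate is installed; the file
# name is illustrative):
#   # /etc/logrotate.d/sriov-watchdog
#   /var/log/sriov-watchdog.log {
#       weekly
#       rotate 4
#       compress
#       missingok
#       notifempty
#   }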
# ============================================================================
set -euo pipefail
# Ensure full PATH for cron (qm is in /usr/sbin)
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH"
# ── Configuration ───────────────────────────────────────────────────────────
# VMs to monitor (must ALL be stopped before SR-IOV reset)
declare -A VM_IPS
VM_IPS[9302]="192.168.40.22"
VM_IPS[9307]="192.168.40.27" # <-- UPDATE THIS to 9307's actual IP
# SR-IOV Physical Function PCI address
PF_PCI="0000:04:00.0"
SRIOV_NUMVFS=3
# Detection tuning
PING_COUNT=3 # pings per check
PING_TIMEOUT=2 # seconds per ping
FAIL_THRESHOLD=4 # failures within window before action
FAIL_WINDOW_SECONDS=600 # 10 min sliding window for failures
RECOVERY_THRESHOLD=5 # consecutive successes to clear failure history
COOLDOWN_SECONDS=600 # 10 min cooldown after a reset (avoid loops)
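# Worked example with the defaults: cron fires once a minute, so the 600s
# window holds at most ~10 samples; a reset needs 4 failed checks in that
# window (6 if dmesg shows no corroborating bnx2x/vfio errors), and 5
# consecutive clean checks wipe the failure history.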
# Paths
STATE_DIR="/var/lib/sriov-watchdog"
LOCK_FILE="/var/run/sriov-watchdog.lock"
# ── End Configuration ───────────────────────────────────────────────────────
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
}
log_warn() {
    log "WARNING: $*"
}
log_err() {
    log "ERROR: $*"
}
# Ensure state directory exists
mkdir -p "$STATE_DIR"
# ── Lock (prevent concurrent runs) ─────────────────────────────────────────
exec 200>"$LOCK_FILE"
if ! flock -n 200; then
    log "Another instance is running. Exiting."
    exit 0
fi
# ── Cooldown check ──────────────────────────────────────────────────────────
COOLDOWN_FILE="$STATE_DIR/last_reset"
if [[ -f "$COOLDOWN_FILE" ]]; then
last_reset=$(cat "$COOLDOWN_FILE")
now=$(date +%s)
elapsed=$(( now - last_reset ))
if (( elapsed < COOLDOWN_SECONDS )); then
remaining=$(( COOLDOWN_SECONDS - elapsed ))
log "In cooldown period (${remaining}s remaining). Skipping."
exit 0
fi
fi
# ── Check if VMs are running ───────────────────────────────────────────────
any_running=false
declare -A vm_running
for vmid in "${!VM_IPS[@]}"; do
    status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}') || status="unknown"
    if [[ "$status" == "running" ]]; then
        vm_running[$vmid]=true
        any_running=true
    else
        vm_running[$vmid]=false
    fi
done
if [[ "$any_running" == "false" ]]; then
log "No monitored VMs are running. Nothing to check."
# Clear any failure counters since VMs are down
for vmid in "${!VM_IPS[@]}"; do
rm -f "$STATE_DIR/fails_${vmid}" "$STATE_DIR/recovery_${vmid}"
done
exit 0
fi
# ── Ping check (time-windowed) ──────────────────────────────────────────────
needs_reset=false
now=$(date +%s)
for vmid in "${!VM_IPS[@]}"; do
    ip="${VM_IPS[$vmid]}"
    fail_file="$STATE_DIR/fails_${vmid}"        # one failure timestamp per line
    recovery_file="$STATE_DIR/recovery_${vmid}" # consecutive success count
    # Skip VMs that aren't running
    if [[ "${vm_running[$vmid]}" != "true" ]]; then
        rm -f "$fail_file" "$recovery_file"
        continue
    fi
    # Ping the VM
    if ping -c "$PING_COUNT" -W "$PING_TIMEOUT" -q "$ip" &>/dev/null; then
        # VM is reachable — increment the recovery counter
        recovery_count=0
        [[ -f "$recovery_file" ]] && recovery_count=$(cat "$recovery_file")
        recovery_count=$(( recovery_count + 1 ))
        echo "$recovery_count" > "$recovery_file"
        if (( recovery_count >= RECOVERY_THRESHOLD )); then
            # Sustained recovery — clear the failure history
            if [[ -f "$fail_file" ]]; then
                log "VM $vmid ($ip) recovered ($recovery_count consecutive successes). Clearing failure history."
                rm -f "$fail_file"
            fi
            rm -f "$recovery_file"
        else
            # Count recent failures still inside the window
            recent=0
            if [[ -f "$fail_file" ]]; then
                while read -r ts; do
                    # Avoid '(( recent++ ))' here: it returns status 1 when
                    # recent is 0, which would trip 'set -e' and kill the script
                    (( now - ts < FAIL_WINDOW_SECONDS )) && recent=$(( recent + 1 ))
                done < "$fail_file"
            fi
            if (( recent > 0 )); then
                log "VM $vmid ($ip) pingable but only $recovery_count/$RECOVERY_THRESHOLD consecutive successes ($recent recent failures still in window)."
            fi
        fi
    else
        # VM is unreachable — record the failure timestamp, reset the recovery counter
        echo "$now" >> "$fail_file"
        rm -f "$recovery_file"
        # Prune entries that have aged out of the window
        if [[ -f "$fail_file" ]]; then
            tmpfile=$(mktemp)
            while read -r ts; do
                (( now - ts < FAIL_WINDOW_SECONDS )) && echo "$ts"
            done < "$fail_file" > "$tmpfile"
            mv "$tmpfile" "$fail_file"
        fi
        # Count recent failures
        recent_fails=$(wc -l < "$fail_file")
        log_warn "VM $vmid ($ip) unreachable. Failures in last $((FAIL_WINDOW_SECONDS/60))min: $recent_fails/$FAIL_THRESHOLD"
        if (( recent_fails >= FAIL_THRESHOLD )); then
            # Double-check: look for bnx2x errors in host dmesg
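            # The grep pattern assumes the VFs of PF 0000:04:00.0 enumerate
            # at 04:01.x (as on this BCM57810); adjust it if your VF
            # addresses differ.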
            if dmesg --time-format iso 2>/dev/null | tail -n 500 | grep -qiE "bnx2x.*04:01|vfio-pci.*04:01.*reset"; then
                log_err "VM $vmid ($ip) hit $recent_fails failures in window AND bnx2x/vfio errors detected in dmesg."
                needs_reset=true
            elif (( recent_fails >= FAIL_THRESHOLD + 2 )); then
                log_err "VM $vmid ($ip) hit $recent_fails failures in window (no bnx2x errors, but triggering reset anyway)."
                needs_reset=true
            else
                log_warn "VM $vmid ($ip) unreachable but no bnx2x errors in dmesg. Waiting for more failures."
            fi
        fi
    fi
done
if [[ "$needs_reset" != "true" ]]; then
exit 0
fi
# ── SR-IOV Reset Procedure ─────────────────────────────────────────────────
log "============================================================"
log "SR-IOV RESET TRIGGERED"
log "============================================================"
# Step 1: Stop all monitored VMs (must stop ALL before touching SR-IOV)
for vmid in "${!VM_IPS[@]}"; do
    if [[ "${vm_running[$vmid]}" == "true" ]]; then
        log "Stopping VM $vmid..."
        # '|| true' keeps a failed qm stop from aborting the whole script
        # under 'set -euo pipefail'; the wait loop below handles stragglers
        qm stop "$vmid" --timeout 120 2>&1 | while read -r line; do log "  qm stop $vmid: $line"; done || true
        # Wait for the VM to fully stop
        for i in $(seq 1 30); do
            status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}') || status="unknown"
            if [[ "$status" == "stopped" ]]; then
                log "VM $vmid stopped successfully."
                break
            fi
            if (( i == 30 )); then
                log_err "VM $vmid did not stop within 30s after qm stop. Forcing shutdown."
                qm stop "$vmid" --skiplock 2>&1 || true
                sleep 5
            fi
            sleep 1
        done
    fi
done
# Step 2: Wait for VFIO to release devices
log "Waiting 5s for vfio-pci to release VF devices..."
sleep 5
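# (A manual sanity check, if resets ever race QEMU teardown:
# 'fuser -v /dev/vfio/*' shows whether any process still holds a VFIO
# group before sriov_numvfs is touched.)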
# Step 3: Reset SR-IOV VFs
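# (The kernel only lets sriov_numvfs change between 0 and a nonzero value,
# so the VFs must be torn down before they can be re-created.)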
SRIOV_FILE="/sys/bus/pci/devices/${PF_PCI}/sriov_numvfs"
if [[ ! -f "$SRIOV_FILE" ]]; then
    log_err "SR-IOV numvfs file not found: $SRIOV_FILE"
    log_err "Aborting reset. Starting VMs back up with potentially bad VFs."
else
    current_vfs=$(cat "$SRIOV_FILE")
    log "Current SR-IOV VFs: $current_vfs"
    log "Disabling SR-IOV VFs (setting to 0)..."
    echo 0 > "$SRIOV_FILE" 2>&1 || log_err "Failed to set sriov_numvfs to 0"
    sleep 3
    verify=$(cat "$SRIOV_FILE")
    log "SR-IOV VFs after disable: $verify"
    log "Re-enabling SR-IOV VFs (setting to $SRIOV_NUMVFS)..."
    echo "$SRIOV_NUMVFS" > "$SRIOV_FILE" 2>&1 || log_err "Failed to set sriov_numvfs to $SRIOV_NUMVFS"
    sleep 5
    verify=$(cat "$SRIOV_FILE")
    log "SR-IOV VFs after re-enable: $verify"
    if (( verify != SRIOV_NUMVFS )); then
        log_err "SR-IOV VF count mismatch! Expected $SRIOV_NUMVFS, got $verify"
    else
        log "SR-IOV VFs reset successfully."
    fi
fi
# Step 4: Start VMs back up (stagger to avoid thundering herd)
for vmid in "${!VM_IPS[@]}"; do
    log "Starting VM $vmid..."
    # '|| true' again guards against 'set -e' aborting before every VM is started
    qm start "$vmid" 2>&1 | while read -r line; do log "  qm start $vmid: $line"; done || true
    sleep 10  # stagger starts
done
# Step 5: Record reset time and clear failure counters
date +%s > "$COOLDOWN_FILE"
for vmid in "${!VM_IPS[@]}"; do
    rm -f "$STATE_DIR/fails_${vmid}" "$STATE_DIR/recovery_${vmid}"
done
log "SR-IOV reset complete. VMs restarted. Cooldown active for ${COOLDOWN_SECONDS}s."
log "============================================================"