SR-IOV VF Watchdog for Proxmox VE (bnx2x)
#!/bin/bash
# ============================================================================
# SR-IOV VF Watchdog for Proxmox VE
# ============================================================================
# Monitors VMs using SR-IOV VFs from a BCM57810 PF for network degradation.
# When degradation is detected, stops the affected VMs, resets the SR-IOV VFs
# on the PF, and restarts them.
#
# Root cause: the bnx2x VF PCIe link can degrade to Width x0, causing
# transmit timeouts, RCU stalls, and cascading failures inside the VM.
#
# Install:
#   cp sriov-watchdog.sh /usr/local/bin/sriov-watchdog.sh
#   chmod +x /usr/local/bin/sriov-watchdog.sh
#   mkdir -p /var/lib/sriov-watchdog
#
# Cron (every minute):
#   * * * * * /usr/local/bin/sriov-watchdog.sh >> /var/log/sriov-watchdog.log 2>&1
# ============================================================================
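# Alternative to cron: a systemd timer pair. A minimal sketch, not part of
# this script; the unit names below are assumptions, adapt as needed.
#
#   # /etc/systemd/system/sriov-watchdog.service
#   [Unit]
#   Description=SR-IOV VF watchdog check
#   [Service]
#   Type=oneshot
#   ExecStart=/usr/local/bin/sriov-watchdog.sh
#
#   # /etc/systemd/system/sriov-watchdog.timer
#   [Unit]
#   Description=Run SR-IOV VF watchdog every minute
#   [Timer]
#   OnCalendar=minutely
#   [Install]
#   WantedBy=timers.target
#
#   systemctl enable --now sriov-watchdog.timer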
set -euo pipefail

# Ensure full PATH for cron (qm is in /usr/sbin)
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH"
# ── Configuration ───────────────────────────────────────────────────────────

# VMs to monitor (must ALL be stopped before SR-IOV reset)
declare -A VM_IPS
VM_IPS[9302]="192.168.40.22"
VM_IPS[9307]="192.168.40.27"   # <-- UPDATE THIS to 9307's actual IP

# SR-IOV Physical Function PCI address
PF_PCI="0000:04:00.0"
SRIOV_NUMVFS=3

# Detection tuning
PING_COUNT=3              # pings per check
PING_TIMEOUT=2            # seconds per ping
FAIL_THRESHOLD=4          # failures within window before action
FAIL_WINDOW_SECONDS=600   # 10 min sliding window for failures
RECOVERY_THRESHOLD=5      # consecutive successes to clear failure history
COOLDOWN_SECONDS=600      # 10 min cooldown after a reset (avoid loops)

# Paths
STATE_DIR="/var/lib/sriov-watchdog"
LOCK_FILE="/var/run/sriov-watchdog.lock"

# ── End Configuration ───────────────────────────────────────────────────────
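# To confirm the PF address and inspect its VFs before configuring, the sysfs
# virtfn symlinks and the lspci link status are useful (diagnostic sketch;
# the exact VF addresses on your system will differ):
#
#   ls -l /sys/bus/pci/devices/0000:04:00.0/virtfn*   # VF PCI addresses
#   lspci -vv -s 0000:04:00.0 | grep -i LnkSta        # watch for "Width x0"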
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
}

log_warn() {
    log "WARNING: $*"
}

log_err() {
    log "ERROR: $*"
}
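# Example log line as produced by the helpers above:
#   [2026-04-16 03:24:00] WARNING: VM 9302 (192.168.40.22) unreachable. ...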
# Ensure state directory exists
mkdir -p "$STATE_DIR"

# ── Lock (prevent concurrent runs) ──────────────────────────────────────────
exec 200>"$LOCK_FILE"
if ! flock -n 200; then
    log "Another instance is running. Exiting."
    exit 0
fi
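# The lock is held on FD 200 for the lifetime of the process and released
# automatically when the script exits, so no explicit unlock is needed.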
# ── Cooldown check ──────────────────────────────────────────────────────────
COOLDOWN_FILE="$STATE_DIR/last_reset"
if [[ -f "$COOLDOWN_FILE" ]]; then
    last_reset=$(cat "$COOLDOWN_FILE")
    now=$(date +%s)
    elapsed=$(( now - last_reset ))
    if (( elapsed < COOLDOWN_SECONDS )); then
        remaining=$(( COOLDOWN_SECONDS - elapsed ))
        log "In cooldown period (${remaining}s remaining). Skipping."
        exit 0
    fi
fi
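# To force a check before the cooldown expires (e.g. after fixing the VF
# issue by hand), delete the marker file:
#
#   rm -f /var/lib/sriov-watchdog/last_reset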
# ── Check if VMs are running ────────────────────────────────────────────────
any_running=false
declare -A vm_running
for vmid in "${!VM_IPS[@]}"; do
    status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}') || status="unknown"
    if [[ "$status" == "running" ]]; then
        vm_running[$vmid]=true
        any_running=true
    else
        vm_running[$vmid]=false
    fi
done

if [[ "$any_running" == "false" ]]; then
    log "No monitored VMs are running. Nothing to check."
    # Clear any failure counters since the VMs are down
    for vmid in "${!VM_IPS[@]}"; do
        rm -f "$STATE_DIR/fails_${vmid}" "$STATE_DIR/recovery_${vmid}"
    done
    exit 0
fi
# ── Ping check (time-windowed) ──────────────────────────────────────────────
needs_reset=false
now=$(date +%s)

for vmid in "${!VM_IPS[@]}"; do
    ip="${VM_IPS[$vmid]}"
    fail_file="$STATE_DIR/fails_${vmid}"        # one timestamp per line
    recovery_file="$STATE_DIR/recovery_${vmid}" # consecutive success count

    # Skip VMs that aren't running
    if [[ "${vm_running[$vmid]}" != "true" ]]; then
        rm -f "$fail_file" "$recovery_file"
        continue
    fi

    # Ping the VM
    if ping -c "$PING_COUNT" -W "$PING_TIMEOUT" -q "$ip" &>/dev/null; then
        # VM is reachable — increment recovery counter
        recovery_count=0
        [[ -f "$recovery_file" ]] && recovery_count=$(cat "$recovery_file")
        recovery_count=$(( recovery_count + 1 ))
        echo "$recovery_count" > "$recovery_file"

        if (( recovery_count >= RECOVERY_THRESHOLD )); then
            # Sustained recovery — clear failure history
            if [[ -f "$fail_file" ]]; then
                log "VM $vmid ($ip) recovered ($recovery_count consecutive successes). Clearing failure history."
                rm -f "$fail_file"
            fi
            rm -f "$recovery_file"
        else
            # Count recent failures still in window. Note: a bare (( recent++ ))
            # returns nonzero when recent is 0 and would abort under `set -e`,
            # so use an arithmetic assignment instead.
            recent=0
            if [[ -f "$fail_file" ]]; then
                while read -r ts; do
                    (( now - ts < FAIL_WINDOW_SECONDS )) && recent=$(( recent + 1 ))
                done < "$fail_file"
            fi
            if (( recent > 0 )); then
                log "VM $vmid ($ip) pingable but only $recovery_count/$RECOVERY_THRESHOLD consecutive successes ($recent recent failures still in window)."
            fi
        fi
    else
        # VM is unreachable — record failure timestamp, reset recovery counter
        echo "$now" >> "$fail_file"
        rm -f "$recovery_file"

        # Prune entries outside the window
        if [[ -f "$fail_file" ]]; then
            tmpfile=$(mktemp)
            while read -r ts; do
                (( now - ts < FAIL_WINDOW_SECONDS )) && echo "$ts"
            done < "$fail_file" > "$tmpfile"
            mv "$tmpfile" "$fail_file"
        fi

        # Count recent failures
        recent_fails=$(wc -l < "$fail_file")
        log_warn "VM $vmid ($ip) unreachable. Failures in last $((FAIL_WINDOW_SECONDS/60))min: $recent_fails/$FAIL_THRESHOLD"

        if (( recent_fails >= FAIL_THRESHOLD )); then
            # Double-check: look for bnx2x errors in host dmesg. The "04:01"
            # pattern matches the VF bus addresses derived from PF 0000:04:00.0;
            # adjust it if PF_PCI changes. grep runs without -q so tail is never
            # killed by SIGPIPE, which would trip `pipefail` inside this test.
            if dmesg --time-format iso 2>/dev/null | tail -500 | grep -iE "bnx2x.*04:01|vfio-pci.*04:01.*reset" >/dev/null; then
                log_err "VM $vmid ($ip) hit $recent_fails failures in window AND bnx2x/vfio errors detected in dmesg."
                needs_reset=true
            else
                if (( recent_fails >= FAIL_THRESHOLD + 2 )); then
                    log_err "VM $vmid ($ip) hit $recent_fails failures in window (no bnx2x errors, but triggering reset anyway)."
                    needs_reset=true
                else
                    log_warn "VM $vmid ($ip) unreachable but no bnx2x errors in dmesg. Waiting for more failures."
                fi
            fi
        fi
    fi
done
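# Worked example of the window math with the defaults above: cron fires every
# 60s, so four consecutive failed checks (roughly 4 minutes of outage) reach
# FAIL_THRESHOLD=4 inside the 600s window. If dmesg corroborates with bnx2x or
# vfio errors, the reset arms immediately; otherwise the watchdog waits for
# FAIL_THRESHOLD + 2 = 6 failures before acting on ping evidence alone.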
| if [[ "$needs_reset" != "true" ]]; then | |
| exit 0 | |
| fi | |
# ── SR-IOV Reset Procedure ──────────────────────────────────────────────────
log "============================================================"
log "SR-IOV RESET TRIGGERED"
log "============================================================"
# Step 1: Stop all monitored VMs (must stop ALL before touching SR-IOV).
# Note: `qm stop` is a hard stop; swap in `qm shutdown` if a guest-side ACPI
# shutdown is preferred and the guests honor it. The `|| true` keeps a failing
# qm pipeline from aborting the script mid-reset under `set -e -o pipefail`.
for vmid in "${!VM_IPS[@]}"; do
    if [[ "${vm_running[$vmid]}" == "true" ]]; then
        log "Stopping VM $vmid..."
        qm stop "$vmid" --timeout 120 2>&1 | while read -r line; do log "  qm stop $vmid: $line"; done || true

        # Wait for the VM to fully stop
        for i in $(seq 1 30); do
            status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}') || status="unknown"
            if [[ "$status" == "stopped" ]]; then
                log "VM $vmid stopped successfully."
                break
            fi
            if (( i == 30 )); then
                log_err "VM $vmid did not stop within 30s after qm stop. Forcing shutdown."
                qm stop "$vmid" --skiplock 2>&1 || true
                sleep 5
            fi
            sleep 1
        done
    fi
done
# Step 2: Wait for VFIO to release the devices
log "Waiting 5s for vfio-pci to release VF devices..."
sleep 5
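# To verify the release by hand, list which devices are still bound to
# vfio-pci (diagnostic sketch; the VF addresses are specific to this host):
#
#   ls /sys/bus/pci/drivers/vfio-pci/ | grep '^0000'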
# Step 3: Reset SR-IOV VFs
SRIOV_FILE="/sys/bus/pci/devices/${PF_PCI}/sriov_numvfs"
if [[ ! -f "$SRIOV_FILE" ]]; then
    log_err "SR-IOV numvfs file not found: $SRIOV_FILE"
    log_err "Aborting reset. Starting VMs back up with potentially bad VFs."
else
    current_vfs=$(cat "$SRIOV_FILE")
    log "Current SR-IOV VFs: $current_vfs"

    log "Disabling SR-IOV VFs (setting to 0)..."
    echo 0 > "$SRIOV_FILE" || log_err "Failed to set sriov_numvfs to 0"
    sleep 3

    verify=$(cat "$SRIOV_FILE")
    log "SR-IOV VFs after disable: $verify"

    log "Re-enabling SR-IOV VFs (setting to $SRIOV_NUMVFS)..."
    echo "$SRIOV_NUMVFS" > "$SRIOV_FILE" || log_err "Failed to set sriov_numvfs to $SRIOV_NUMVFS"
    sleep 5

    verify=$(cat "$SRIOV_FILE")
    log "SR-IOV VFs after re-enable: $verify"
    if (( verify != SRIOV_NUMVFS )); then
        log_err "SR-IOV VF count mismatch! Expected $SRIOV_NUMVFS, got $verify"
    else
        log "SR-IOV VFs reset successfully."
    fi
fi
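# Optional post-reset verification (a sketch, run by hand): confirm the PF
# reports the expected VF count and the link is no longer Width x0.
#
#   cat /sys/bus/pci/devices/0000:04:00.0/sriov_numvfs
#   lspci -vv -s 0000:04:00.0 | grep -i LnkSta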
# Step 4: Start VMs back up (staggered to avoid a thundering herd)
for vmid in "${!VM_IPS[@]}"; do
    log "Starting VM $vmid..."
    # `|| true`: a VM that fails to start must not abort the remaining starts.
    qm start "$vmid" 2>&1 | while read -r line; do log "  qm start $vmid: $line"; done || true
    sleep 10   # stagger starts
done
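# Optional: wait for each guest to answer pings again before declaring
# success (a commented-out sketch; reuses the VM_IPS map defined above):
#
#   for vmid in "${!VM_IPS[@]}"; do
#       for i in $(seq 1 30); do
#           ping -c 1 -W 2 -q "${VM_IPS[$vmid]}" &>/dev/null && break
#           sleep 5
#       done
#   done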
# Step 5: Record reset time and clear failure counters
date +%s > "$COOLDOWN_FILE"
for vmid in "${!VM_IPS[@]}"; do
    rm -f "$STATE_DIR/fails_${vmid}" "$STATE_DIR/recovery_${vmid}"
done

log "SR-IOV reset complete. VMs restarted. Cooldown active for ${COOLDOWN_SECONDS}s."
log "============================================================"