Skip to content

Instantly share code, notes, and snippets.

@lavacano
Created April 16, 2026 03:26
Show Gist options
  • Select an option

  • Save lavacano/6e4c936821cc3d35bc31708fa5176c19 to your computer and use it in GitHub Desktop.

Select an option

Save lavacano/6e4c936821cc3d35bc31708fa5176c19 to your computer and use it in GitHub Desktop.
sriov-watchdog — systemd version of sriov-watchdog.sh (three files below: the service unit, the watchdog script, and the timer unit)
[Unit]
Description=SR-IOV VF Watchdog for BCM57810 SR-IOV VMs
# Order after Proxmox guest autostart so monitored VMs have had a chance to
# come up before the first watchdog pass.
After=pve-guests.service
[Service]
# oneshot: each timer firing runs the script to completion and the unit then
# returns to inactive; systemd will not start a second instance concurrently.
Type=oneshot
ExecStart=/usr/local/bin/sriov-watchdog.sh
# Give the service a private /tmp namespace (sandboxing hardening; note that
# PrivateTmp isolates /tmp only — it does not prevent FD/env inheritance).
PrivateTmp=true
# Kill any leftover children when the script exits (belt + suspenders)
KillMode=control-group
# Don't let a hung qm command keep us stuck forever
TimeoutStartSec=300
# Log to journal (stdout/stderr captured automatically)
StandardOutput=journal
StandardError=journal
SyslogIdentifier=sriov-watchdog
#!/bin/bash
# ============================================================================
# SR-IOV VF Watchdog for Proxmox VE
# ============================================================================
# Monitors VMs using SR-IOV VFs from a BCM57810 PF for network degradation.
# When detected, gracefully stops affected VMs, resets SR-IOV VFs, and
# restarts them.
#
# Root cause: bnx2x VF PCIe link can degrade to Width x0, causing transmit
# timeouts, RCU stalls, and cascading failures inside the VM.
#
# Runs as a systemd oneshot service triggered by sriov-watchdog.timer.
# No lock file needed — systemd prevents concurrent runs.
#
# Install:
# cp sriov-watchdog.sh /usr/local/bin/sriov-watchdog.sh
# cp sriov-watchdog.service /etc/systemd/system/
# cp sriov-watchdog.timer /etc/systemd/system/
# chmod +x /usr/local/bin/sriov-watchdog.sh
# mkdir -p /var/lib/sriov-watchdog
# systemctl daemon-reload
# systemctl enable --now sriov-watchdog.timer
#
# Remove old cron entry:
# crontab -e # delete the sriov-watchdog line
#
# View logs:
# journalctl -u sriov-watchdog --since "1 hour ago"
# systemctl status sriov-watchdog.timer
# ============================================================================
set -euo pipefail
# Systemd runs services with a minimal environment; pin a root-safe PATH so
# qm (/usr/sbin on Proxmox) and other admin tools are always found.
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH"
# ── Configuration ───────────────────────────────────────────────────────────
# Map of monitored VMID -> guest IP address to ping.
declare -A VM_IPS
VM_IPS[9302]="192.168.40.22"
VM_IPS[9307]="192.168.40.27"
# PCI address of the BCM57810 physical function whose VFs get reset.
PF_PCI="0000:04:00.0"
# Number of VFs to re-create after sriov_numvfs is toggled through 0.
SRIOV_NUMVFS=3
# Ping probe: PING_COUNT echo requests, PING_TIMEOUT seconds per reply (-W).
PING_COUNT=3
PING_TIMEOUT=2
# Escalate once FAIL_THRESHOLD failed runs accumulate within the window.
FAIL_THRESHOLD=4
FAIL_WINDOW_SECONDS=600
# Consecutive successful runs required before failure history is cleared.
RECOVERY_THRESHOLD=5
# Minimum time between two reset procedures (anti-flap).
COOLDOWN_SECONDS=600
# Persistent per-VM state: failure timestamps, recovery counters, cooldown.
STATE_DIR="/var/lib/sriov-watchdog"
# ── End Configuration ───────────────────────────────────────────────────────
# Timestamped loggers; stdout/stderr are captured by the journal.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
log_warn() { log "WARNING: $*"; }
log_err() { log "ERROR: $*"; }
# Ensure the state directory exists (also created at install time).
mkdir -p "$STATE_DIR"
# ── Cooldown check ──────────────────────────────────────────────────────────
# After a reset we refuse to act again for COOLDOWN_SECONDS so the watchdog
# cannot flap VMs up and down while an underlying problem persists.
COOLDOWN_FILE="$STATE_DIR/last_reset"
if [[ -f "$COOLDOWN_FILE" ]]; then
  last_reset=$(<"$COOLDOWN_FILE")
  # Guard against a corrupt or empty state file: a non-numeric timestamp
  # would make the arithmetic below a fatal error under `set -e`, breaking
  # every subsequent run. Treat bad content as "no previous reset".
  if [[ "$last_reset" =~ ^[0-9]+$ ]]; then
    now=$(date +%s)
    elapsed=$(( now - last_reset ))
    if (( elapsed < COOLDOWN_SECONDS )); then
      remaining=$(( COOLDOWN_SECONDS - elapsed ))
      log "In cooldown period (${remaining}s remaining). Skipping."
      exit 0
    fi
  else
    log_warn "Ignoring invalid cooldown timestamp in $COOLDOWN_FILE"
    rm -f "$COOLDOWN_FILE"
  fi
fi
# ── Check if VMs are running ───────────────────────────────────────────────
# Build vm_running[vmid] = true/false for each monitored VM, and bail out
# early (clearing all per-VM state) when none of them is running.
any_running=false
declare -A vm_running
for vmid in "${!VM_IPS[@]}"; do
# `|| status="unknown"` keeps `set -e -o pipefail` from aborting when qm
# fails, e.g. because the VM does not exist on this node.
status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}') || status="unknown"
if [[ "$status" == "running" ]]; then
vm_running[$vmid]=true
any_running=true
else
vm_running[$vmid]=false
fi
done
if [[ "$any_running" == "false" ]]; then
log "No monitored VMs are running. Nothing to check."
# Drop stale failure/recovery files so they cannot influence a later run.
for vmid in "${!VM_IPS[@]}"; do
rm -f "$STATE_DIR/fails_${vmid}" "$STATE_DIR/recovery_${vmid}"
done
exit 0
fi
# ── Ping check (time-windowed) ──────────────────────────────────────────────
# For each running VM: a successful ping run bumps a consecutive-success
# counter, and RECOVERY_THRESHOLD successes clear its failure history. A
# failed run appends a timestamp to the VM's failure file, prunes entries
# older than FAIL_WINDOW_SECONDS, and escalates once FAIL_THRESHOLD recent
# failures accumulate (sooner if matching bnx2x/vfio errors show in dmesg).
needs_reset=false
now=$(date +%s)
for vmid in "${!VM_IPS[@]}"; do
  ip="${VM_IPS[$vmid]}"
  fail_file="$STATE_DIR/fails_${vmid}"
  recovery_file="$STATE_DIR/recovery_${vmid}"
  # Stopped VMs are not monitored; drop any stale state for them.
  if [[ "${vm_running[$vmid]}" != "true" ]]; then
    rm -f "$fail_file" "$recovery_file"
    continue
  fi
  if ping -c "$PING_COUNT" -W "$PING_TIMEOUT" -q "$ip" &>/dev/null; then
    recovery_count=0
    [[ -f "$recovery_file" ]] && recovery_count=$(cat "$recovery_file")
    recovery_count=$(( recovery_count + 1 ))
    echo "$recovery_count" > "$recovery_file"
    if (( recovery_count >= RECOVERY_THRESHOLD )); then
      if [[ -f "$fail_file" ]]; then
        log "VM $vmid ($ip) recovered ($recovery_count consecutive successes). Clearing failure history."
        rm -f "$fail_file"
      fi
      rm -f "$recovery_file"
    else
      # Count failures still inside the window, purely for logging.
      recent=0
      if [[ -f "$fail_file" ]]; then
        while read -r ts; do
          # BUGFIX: the original used `(( recent++ ))` as the final command
          # of an `&&` list. Post-increment evaluates to 0 on the first hit,
          # so the arithmetic command returns status 1 and `set -e` kills
          # the entire script. Plain assignment has no such trap.
          if (( now - ts < FAIL_WINDOW_SECONDS )); then
            recent=$(( recent + 1 ))
          fi
        done < "$fail_file"
      fi
      if (( recent > 0 )); then
        log "VM $vmid ($ip) pingable but only $recovery_count/$RECOVERY_THRESHOLD consecutive successes ($recent recent failures still in window)."
      fi
    fi
  else
    # Record this failure and reset the consecutive-success counter.
    echo "$now" >> "$fail_file"
    rm -f "$recovery_file"
    # Prune timestamps that have aged out of the window. The file is
    # guaranteed to exist here — we just appended to it — so the original
    # `[[ -f … ]]` guard around this prune was redundant.
    tmpfile=$(mktemp)
    while read -r ts; do
      (( now - ts < FAIL_WINDOW_SECONDS )) && echo "$ts"
    done < "$fail_file" > "$tmpfile"
    mv "$tmpfile" "$fail_file"
    recent_fails=$(wc -l < "$fail_file")
    log_warn "VM $vmid ($ip) unreachable. Failures in last $((FAIL_WINDOW_SECONDS/60))min: $recent_fails/$FAIL_THRESHOLD"
    if (( recent_fails >= FAIL_THRESHOLD )); then
      # NOTE(review): the dmesg pattern hard-codes bus/device "04:01"; it
      # must track PF_PCI if the card ever moves slots — confirm on change.
      if dmesg --time-format iso 2>/dev/null | tail -500 | grep -qiE "bnx2x.*04:01|vfio-pci.*04:01.*reset"; then
        log_err "VM $vmid ($ip) hit $recent_fails failures in window AND bnx2x/vfio errors detected in dmesg."
        needs_reset=true
      elif (( recent_fails >= FAIL_THRESHOLD + 2 )); then
        # No kernel-side evidence, but the VM has been unreachable long
        # enough that we reset anyway rather than wait forever.
        log_err "VM $vmid ($ip) hit $recent_fails failures in window (no bnx2x errors, but triggering reset anyway)."
        needs_reset=true
      else
        log_warn "VM $vmid ($ip) unreachable but no bnx2x errors in dmesg. Waiting for more failures."
      fi
    fi
  fi
done
# No reset needed: emit a heartbeat line listing the healthy VMs so the
# journal shows the watchdog actually ran, then stop here.
if [[ "$needs_reset" != "true" ]]; then
  healthy=()
  for id in "${!VM_IPS[@]}"; do
    [[ "${vm_running[$id]}" == "true" ]] || continue
    healthy+=("$id")
  done
  log "All clear. Monitored VMs healthy: ${healthy[*]}"
  exit 0
fi
# ── SR-IOV Reset Procedure ─────────────────────────────────────────────────
log "============================================================"
log "SR-IOV RESET TRIGGERED"
log "============================================================"
# Stop phase: gracefully stop every running monitored VM, polling up to 30s
# for it to reach "stopped", then force-stop as a last resort.
for vmid in "${!VM_IPS[@]}"; do
  if [[ "${vm_running[$vmid]}" == "true" ]]; then
    log "Stopping VM $vmid..."
    # `|| true`: a failing qm stop must not abort the whole recovery run —
    # under `set -e -o pipefail` the original pipeline would kill the script
    # here, leaving every remaining VM stopped with no restart and no
    # cooldown stamp written.
    qm stop "$vmid" --timeout 120 2>&1 | while read -r line; do log " qm stop $vmid: $line"; done || true
    stopped=false
    for (( i = 1; i <= 30; i++ )); do
      status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}') || status="unknown"
      if [[ "$status" == "stopped" ]]; then
        log "VM $vmid stopped successfully."
        stopped=true
        break
      fi
      sleep 1
    done
    # BUGFIX: the original forced shutdown inside the poll loop at i==30 and
    # never re-checked; force after the poll so the escalation is explicit.
    if [[ "$stopped" != "true" ]]; then
      log_err "VM $vmid did not stop within 30s after qm stop. Forcing shutdown."
      qm stop "$vmid" --skiplock 2>&1 || true
      sleep 5
    fi
  fi
done
# VF reset phase: toggling sriov_numvfs N -> 0 -> N destroys and re-creates
# every VF, which is what clears the degraded PCIe link state.
log "Waiting 5s for vfio-pci to release VF devices..."
sleep 5
SRIOV_FILE="/sys/bus/pci/devices/${PF_PCI}/sriov_numvfs"
if [[ ! -f "$SRIOV_FILE" ]]; then
  log_err "SR-IOV numvfs file not found: $SRIOV_FILE"
  log_err "Aborting reset. Starting VMs back up with potentially bad VFs."
else
  current_vfs=$(cat "$SRIOV_FILE")
  log "Current SR-IOV VFs: $current_vfs"
  log "Disabling SR-IOV VFs (setting to 0)..."
  # BUGFIX: the original appended `2>&1` after `> "$SRIOV_FILE"`, which
  # redirected shell write-error text INTO the sysfs file (stdout was
  # already pointed there). The `|| log_err` alone handles failure.
  echo 0 > "$SRIOV_FILE" || log_err "Failed to set sriov_numvfs to 0"
  sleep 3
  verify=$(cat "$SRIOV_FILE")
  log "SR-IOV VFs after disable: $verify"
  log "Re-enabling SR-IOV VFs (setting to $SRIOV_NUMVFS)..."
  echo "$SRIOV_NUMVFS" > "$SRIOV_FILE" || log_err "Failed to set sriov_numvfs to $SRIOV_NUMVFS"
  sleep 5
  verify=$(cat "$SRIOV_FILE")
  log "SR-IOV VFs after re-enable: $verify"
  # String comparison instead of (( )): an empty or garbled sysfs read would
  # make the original arithmetic test a fatal error under `set -e`.
  if [[ "$verify" != "$SRIOV_NUMVFS" ]]; then
    log_err "SR-IOV VF count mismatch! Expected $SRIOV_NUMVFS, got $verify"
  else
    log "SR-IOV VFs reset successfully."
  fi
fi
# Restart phase: bring every monitored VM back up, then record the cooldown
# stamp and clear per-VM failure state.
for vmid in "${!VM_IPS[@]}"; do
  log "Starting VM $vmid..."
  # `|| true`: one VM failing to start must not abort the script — under
  # `set -e -o pipefail` the original pipeline would exit here, skipping
  # the remaining VM starts AND the cooldown write, so the very next timer
  # run would immediately trigger another full reset (a reset loop).
  qm start "$vmid" 2>&1 | while read -r line; do log " qm start $vmid: $line"; done || true
  # Stagger starts so VFs are claimed one VM at a time.
  sleep 10
done
date +%s > "$COOLDOWN_FILE"
for vmid in "${!VM_IPS[@]}"; do
  rm -f "$STATE_DIR/fails_${vmid}" "$STATE_DIR/recovery_${vmid}"
done
log "SR-IOV reset complete. VMs restarted. Cooldown active for ${COOLDOWN_SECONDS}s."
log "============================================================"
[Unit]
Description=SR-IOV VF Watchdog Timer
# Hold the first activation until Proxmox guest autostart has been ordered.
After=pve-guests.service
[Timer]
# Run every 2 minutes
# First run 120s after boot, then 120s after each service activation.
OnBootSec=120
OnUnitActiveSec=120
# Don't stack up missed runs
Persistent=false
# Small jitter to avoid exact-second scheduling conflicts
RandomizedDelaySec=5
[Install]
WantedBy=timers.target
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment