Created
April 16, 2026 03:26
-
-
Save lavacano/6e4c936821cc3d35bc31708fa5176c19 to your computer and use it in GitHub Desktop.
sriov-watchdog.sh systemd version
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| [Unit] | |
| Description=SR-IOV VF Watchdog for BCM57810 SR-IOV VMs | |
| After=pve-guests.service | |
| [Service] | |
| Type=oneshot | |
| ExecStart=/usr/local/bin/sriov-watchdog.sh | |
| # Give the service its own private /tmp and /var/tmp, isolated from other processes | |
| PrivateTmp=true | |
| # Kill any leftover children when the script exits (belt + suspenders) | |
| KillMode=control-group | |
| # Don't let a hung qm command keep us stuck forever | |
| TimeoutStartSec=300 | |
| # Log to journal (stdout/stderr captured automatically) | |
| StandardOutput=journal | |
| StandardError=journal | |
| SyslogIdentifier=sriov-watchdog |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| # ============================================================================ | |
| # SR-IOV VF Watchdog for Proxmox VE | |
| # ============================================================================ | |
| # Monitors VMs using SR-IOV VFs from a BCM57810 PF for network degradation. | |
| # When detected, gracefully stops affected VMs, resets SR-IOV VFs, and | |
| # restarts them. | |
| # | |
| # Root cause: bnx2x VF PCIe link can degrade to Width x0, causing transmit | |
| # timeouts, RCU stalls, and cascading failures inside the VM. | |
| # | |
| # Runs as a systemd oneshot service triggered by sriov-watchdog.timer. | |
| # No lock file needed — systemd prevents concurrent runs. | |
| # | |
| # Install: | |
| # cp sriov-watchdog.sh /usr/local/bin/sriov-watchdog.sh | |
| # cp sriov-watchdog.service /etc/systemd/system/ | |
| # cp sriov-watchdog.timer /etc/systemd/system/ | |
| # chmod +x /usr/local/bin/sriov-watchdog.sh | |
| # mkdir -p /var/lib/sriov-watchdog | |
| # systemctl daemon-reload | |
| # systemctl enable --now sriov-watchdog.timer | |
| # | |
| # Remove old cron entry: | |
| # crontab -e # delete the sriov-watchdog line | |
| # | |
| # View logs: | |
| # journalctl -u sriov-watchdog --since "1 hour ago" | |
| # systemctl status sriov-watchdog.timer | |
| # ============================================================================ | |
# Strict mode: abort on unhandled command failures, on use of unset variables,
# and when any stage of a pipeline fails.
set -euo pipefail

# Search the system tool directories first so the external commands used below
# resolve no matter what PATH the caller passes in.
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH"

# ── Configuration ───────────────────────────────────────────────────────────
# Monitored VMs: Proxmox VMID -> IP address that is pinged for that VM.
declare -Ar VM_IPS=(
  [9302]="192.168.40.22"
  [9307]="192.168.40.27"
)

# PCI address of the physical function whose sriov_numvfs attribute is reset.
readonly PF_PCI="0000:04:00.0"
# Number of VFs written back to sriov_numvfs after the VFs were torn down.
readonly SRIOV_NUMVFS=3

# ping(8) parameters: packets per probe (-c) and reply timeout in seconds (-W).
readonly PING_COUNT=3
readonly PING_TIMEOUT=2

# Failed probes within FAIL_WINDOW_SECONDS needed before a reset is considered.
readonly FAIL_THRESHOLD=4
readonly FAIL_WINDOW_SECONDS=600
# Consecutive successful probes needed before the failure history is cleared.
readonly RECOVERY_THRESHOLD=5
# Minimum number of seconds between two resets.
readonly COOLDOWN_SECONDS=600

# Directory holding the per-VM counters and the last-reset timestamp.
readonly STATE_DIR="/var/lib/sriov-watchdog"
# ── End Configuration ───────────────────────────────────────────────────────
# Logging helpers. Each message is written to stdout as one line prefixed with
# a local "[YYYY-MM-DD HH:MM:SS]" timestamp.
# Arguments: $* - message text; multiple arguments are joined with spaces.
log() {
  # printf's %(...)T (bash >= 4.2) formats the current time (-1) in-process,
  # so no date(1) is forked for every log line.
  printf '[%(%Y-%m-%d %H:%M:%S)T] %s\n' -1 "$*"
}

# Same as log, with the message tagged as a warning.
log_warn() { log "WARNING: $*"; }

# Same as log, with the message tagged as an error.
log_err() { log "ERROR: $*"; }
# Make sure the state directory exists before any state file is read or written.
mkdir -p "$STATE_DIR"

# ── Cooldown check ──────────────────────────────────────────────────────────
# The epoch time of the last reset is stored in $COOLDOWN_FILE. While fewer
# than COOLDOWN_SECONDS have passed since then, this run does nothing.
COOLDOWN_FILE="$STATE_DIR/last_reset"
if [[ -f "$COOLDOWN_FILE" ]]; then
  last_reset=$(<"$COOLDOWN_FILE") || last_reset=""
  if [[ "$last_reset" =~ ^[0-9]+$ ]]; then
    now=$(date +%s)
    # 10# forces base 10 so a value with leading zeros is not parsed as octal.
    elapsed=$(( now - 10#$last_reset ))
    # A negative elapsed time (timestamp in the future, e.g. after a clock
    # change) must not extend the cooldown, so it is treated as expired.
    if (( elapsed >= 0 && elapsed < COOLDOWN_SECONDS )); then
      remaining=$(( COOLDOWN_SECONDS - elapsed ))
      log "In cooldown period (${remaining}s remaining). Skipping."
      exit 0
    fi
  else
    # An empty or corrupt file would otherwise make the arithmetic above fail
    # and, under set -e, kill every subsequent run of the watchdog.
    log_warn "Ignoring invalid cooldown timestamp in $COOLDOWN_FILE."
    rm -f "$COOLDOWN_FILE"
  fi
fi
# ── Check if VMs are running ───────────────────────────────────────────────
# Record for every monitored VM whether `qm status` reports it as "running".
# Any other state, or a failing qm call, counts as not running.
any_running=false
declare -A vm_running=()
for vmid in "${!VM_IPS[@]}"; do
  # The state is taken from the second field of qm's output. With pipefail a
  # failing qm fails the whole pipeline, which selects the "unknown" fallback.
  status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}') || status="unknown"
  if [[ "$status" == "running" ]]; then
    vm_running[$vmid]=true
    any_running=true
  else
    vm_running[$vmid]=false
  fi
done

# Nothing to monitor: drop all per-VM counters so stale history cannot trigger
# a reset once the VMs come back, then finish this run.
if [[ "$any_running" == "false" ]]; then
  log "No monitored VMs are running. Nothing to check."
  for vmid in "${!VM_IPS[@]}"; do
    rm -f "$STATE_DIR/fails_${vmid}" "$STATE_DIR/recovery_${vmid}"
  done
  exit 0
fi
# ── Ping check (time-windowed) ──────────────────────────────────────────────
# For every running VM:
#   * a successful ping increments the consecutive-success counter stored in
#     recovery_<vmid>; after RECOVERY_THRESHOLD successes the failure history
#     is cleared;
#   * a failed ping appends the current epoch time to fails_<vmid>, prunes
#     entries older than FAIL_WINDOW_SECONDS, and may request a reset.
needs_reset=false
now=$(date +%s)
for vmid in "${!VM_IPS[@]}"; do
  ip="${VM_IPS[$vmid]}"
  fail_file="$STATE_DIR/fails_${vmid}"
  recovery_file="$STATE_DIR/recovery_${vmid}"

  # VMs that are not running are not monitored; drop any stale counters.
  if [[ "${vm_running[$vmid]}" != "true" ]]; then
    rm -f "$fail_file" "$recovery_file"
    continue
  fi

  if ping -c "$PING_COUNT" -W "$PING_TIMEOUT" -q "$ip" &>/dev/null; then
    # Reachable: bump the consecutive-success counter. Anything that is not a
    # plain number in the state file restarts the count at zero.
    recovery_count=0
    if [[ -f "$recovery_file" ]]; then
      recovery_count=$(<"$recovery_file") || recovery_count=0
      [[ "$recovery_count" =~ ^[0-9]+$ ]] || recovery_count=0
    fi
    recovery_count=$(( 10#$recovery_count + 1 ))
    echo "$recovery_count" > "$recovery_file"

    if (( recovery_count >= RECOVERY_THRESHOLD )); then
      if [[ -f "$fail_file" ]]; then
        log "VM $vmid ($ip) recovered ($recovery_count consecutive successes). Clearing failure history."
        rm -f "$fail_file"
      fi
      rm -f "$recovery_file"
    else
      # Still recovering: report how many failures remain inside the window.
      recent=0
      if [[ -f "$fail_file" ]]; then
        while read -r ts; do
          [[ "$ts" =~ ^[0-9]+$ ]] || continue
          if (( now - 10#$ts < FAIL_WINDOW_SECONDS )); then
            # Plain assignment instead of (( recent++ )): the post-increment
            # evaluates to 0 on the first hit, returns status 1, and would
            # abort the script under set -e.
            recent=$(( recent + 1 ))
          fi
        done < "$fail_file"
      fi
      if (( recent > 0 )); then
        log "VM $vmid ($ip) pingable but only $recovery_count/$RECOVERY_THRESHOLD consecutive successes ($recent recent failures still in window)."
      fi
    fi
  else
    # Unreachable: record the failure and restart the success streak.
    echo "$now" >> "$fail_file"
    rm -f "$recovery_file"

    # Prune timestamps that fell out of the window. The temp file is created
    # next to the target so the final mv is a rename on the same filesystem.
    tmpfile=$(mktemp "${fail_file}.XXXXXX")
    while read -r ts; do
      [[ "$ts" =~ ^[0-9]+$ ]] || continue
      if (( now - 10#$ts < FAIL_WINDOW_SECONDS )); then
        echo "$ts"
      fi
    done < "$fail_file" > "$tmpfile"
    mv "$tmpfile" "$fail_file"

    recent_fails=$(wc -l < "$fail_file")
    log_warn "VM $vmid ($ip) unreachable. Failures in last $((FAIL_WINDOW_SECONDS/60))min: $recent_fails/$FAIL_THRESHOLD"

    if (( recent_fails >= FAIL_THRESHOLD )); then
      # Capture the kernel log tail first and grep the captured text. Piping
      # straight into `grep -q` under pipefail can report "no match" when grep
      # exits early and an upstream stage dies from SIGPIPE.
      recent_dmesg=$(dmesg --time-format iso 2>/dev/null | tail -n 500) || recent_dmesg=""
      # NOTE(review): the 04:01 bus address is hard-coded; presumably it is
      # where the VFs of PF_PCI enumerate. Adjust it if PF_PCI changes.
      if grep -qiE "bnx2x.*04:01|vfio-pci.*04:01.*reset" <<<"$recent_dmesg"; then
        log_err "VM $vmid ($ip) hit $recent_fails failures in window AND bnx2x/vfio errors detected in dmesg."
        needs_reset=true
      elif (( recent_fails >= FAIL_THRESHOLD + 2 )); then
        # Without kernel-log evidence, two extra failures are required.
        log_err "VM $vmid ($ip) hit $recent_fails failures in window (no bnx2x errors, but triggering reset anyway)."
        needs_reset=true
      else
        log_warn "VM $vmid ($ip) unreachable but no bnx2x errors in dmesg. Waiting for more failures."
      fi
    fi
  fi
done
if [[ "$needs_reset" != "true" ]]; then
  # Log a brief status line so we know the watchdog is running. A running VM
  # counts as healthy only when it has no recorded ping failures; otherwise
  # the line says so instead of claiming that everything is healthy.
  healthy_vms=()
  failing_vms=()
  for vmid in "${!VM_IPS[@]}"; do
    [[ "${vm_running[$vmid]}" == "true" ]] || continue
    if [[ -s "$STATE_DIR/fails_${vmid}" ]]; then
      failing_vms+=("$vmid")
    else
      healthy_vms+=("$vmid")
    fi
  done

  if (( ${#failing_vms[@]} == 0 )); then
    log "All clear. Monitored VMs healthy: ${healthy_vms[*]}"
  else
    log_warn "No reset needed yet. VMs with recorded ping failures: ${failing_vms[*]}. Healthy: ${healthy_vms[*]:-none}"
  fi
  exit 0
fi
# ── SR-IOV Reset Procedure ─────────────────────────────────────────────────
log "============================================================"
log "SR-IOV RESET TRIGGERED"
log "============================================================"

# Step 1: stop every monitored VM that was running when this run started.
# NOTE(review): each VM can take up to 120s (qm stop --timeout) plus roughly
# 35s of polling; with several VMs this can exceed the service unit's
# TimeoutStartSec=300 — confirm the budget is sufficient.
for vmid in "${!VM_IPS[@]}"; do
  [[ "${vm_running[$vmid]}" == "true" ]] || continue

  log "Stopping VM $vmid..."
  # Under pipefail + set -e a failing qm stop would abort the script here and
  # leave the reset half done, so the failure is logged and the state polling
  # below decides how to proceed.
  qm stop "$vmid" --timeout 120 2>&1 \
    | while read -r line || [[ -n "$line" ]]; do log " qm stop $vmid: $line"; done \
    || log_warn "qm stop $vmid reported an error; polling VM state anyway."

  # Poll once per second for up to 30 attempts; on the last attempt fall back
  # to a stop that ignores the VM lock.
  for (( i = 1; i <= 30; i++ )); do
    status=$(qm status "$vmid" 2>/dev/null | awk '{print $2}') || status="unknown"
    if [[ "$status" == "stopped" ]]; then
      log "VM $vmid stopped successfully."
      break
    fi
    if (( i == 30 )); then
      log_err "VM $vmid did not stop within 30s after qm stop. Forcing shutdown."
      qm stop "$vmid" --skiplock 2>&1 || true
      sleep 5
    fi
    sleep 1
  done
done

log "Waiting 5s for vfio-pci to release VF devices..."
sleep 5
# Step 2: recreate the VFs by writing 0 and then SRIOV_NUMVFS to the PF's
# sriov_numvfs attribute. Failures are logged but never abort the script,
# because the VMs are stopped at this point and must still be started again.
SRIOV_FILE="/sys/bus/pci/devices/${PF_PCI}/sriov_numvfs"
if [[ ! -f "$SRIOV_FILE" ]]; then
  log_err "SR-IOV numvfs file not found: $SRIOV_FILE"
  log_err "Aborting reset. Starting VMs back up with potentially bad VFs."
else
  current_vfs=$(<"$SRIOV_FILE") || current_vfs="unknown"
  log "Current SR-IOV VFs: $current_vfs"

  log "Disabling SR-IOV VFs (setting to 0)..."
  # No 2>&1 after the redirection: that would send echo's own error text into
  # the sysfs attribute instead of the journal.
  echo 0 > "$SRIOV_FILE" || log_err "Failed to set sriov_numvfs to 0"
  sleep 3
  verify=$(<"$SRIOV_FILE") || verify="unknown"
  log "SR-IOV VFs after disable: $verify"

  log "Re-enabling SR-IOV VFs (setting to $SRIOV_NUMVFS)..."
  echo "$SRIOV_NUMVFS" > "$SRIOV_FILE" || log_err "Failed to set sriov_numvfs to $SRIOV_NUMVFS"
  sleep 5
  verify=$(<"$SRIOV_FILE") || verify="unknown"
  log "SR-IOV VFs after re-enable: $verify"

  # String comparison: an unreadable or non-numeric value simply counts as a
  # mismatch instead of being evaluated as an arithmetic expression.
  if [[ "$verify" != "$SRIOV_NUMVFS" ]]; then
    log_err "SR-IOV VF count mismatch! Expected $SRIOV_NUMVFS, got $verify"
  else
    log "SR-IOV VFs reset successfully."
  fi
fi
# Step 3: start the monitored VMs again, 10s apart.
# NOTE(review): this starts every configured VM, including ones that were not
# running before the reset — confirm that this is intended.
for vmid in "${!VM_IPS[@]}"; do
  log "Starting VM $vmid..."
  # Under pipefail + set -e a failing qm start would abort the script before
  # the remaining VMs are started and before the cooldown is recorded, so the
  # failure is logged and the loop continues.
  qm start "$vmid" 2>&1 \
    | while read -r line || [[ -n "$line" ]]; do log " qm start $vmid: $line"; done \
    || log_err "qm start $vmid failed; continuing with the remaining VMs."
  sleep 10
done

# Record the reset time (read by the cooldown check at the start of each run)
# and start over with empty per-VM counters.
date +%s > "$COOLDOWN_FILE"
for vmid in "${!VM_IPS[@]}"; do
  rm -f "$STATE_DIR/fails_${vmid}" "$STATE_DIR/recovery_${vmid}"
done

log "SR-IOV reset complete. VMs restarted. Cooldown active for ${COOLDOWN_SECONDS}s."
log "============================================================"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| [Unit] | |
| Description=SR-IOV VF Watchdog Timer | |
| After=pve-guests.service | |
| [Timer] | |
| # Run every 2 minutes | |
| OnBootSec=120 | |
| OnUnitActiveSec=120 | |
| # Don't catch up on missed runs (Persistent= only affects OnCalendar= timers; kept explicit here) | |
| Persistent=false | |
| # Small jitter to avoid exact-second scheduling conflicts | |
| RandomizedDelaySec=5 | |
| [Install] | |
| WantedBy=timers.target |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment