Skip to content

Instantly share code, notes, and snippets.

@zuedev
Last active April 19, 2026 20:57
Show Gist options
  • Select an option

  • Save zuedev/6319958012aed5a959c7726febd29bac to your computer and use it in GitHub Desktop.

Select an option

Save zuedev/6319958012aed5a959c7726febd29bac to your computer and use it in GitHub Desktop.
Bash health check script for an Intel i9-9900K / 128GB DDR4-2666 / RAID 0 NVMe system running Debian 13.
#!/usr/bin/env bash
# =============================================================================
# System Health Check — i9-9900K / 128GB DDR4-2666 / RAID 0 NVMe
# =============================================================================
# Usage:
# chmod +x healthcheck.sh
# sudo ./healthcheck.sh # Full check
# sudo ./healthcheck.sh --quick # Skip fio storage test
# sudo ./healthcheck.sh --fix # Attempt to fix issues found
# =============================================================================
# Strict mode: abort on unhandled errors, unset variables and failed pipeline
# stages. Every check below must therefore guard commands that may legitimately
# fail (e.g. grep finding no match).
set -euo pipefail

# --- Thresholds (tuned to this system's known-good baseline) -----------------
# NOTE(review): CPU_EVENTS_MIN is not referenced anywhere in this script and its
# original comment was a copy of the CPU_MIPS_MIN one. It is kept so that
# nothing depending on the name breaks — confirm before removing.
readonly CPU_EVENTS_MIN=5000           # unused — see note above
readonly CPU_MIPS_MIN=4000             # 7z single-thread MIPS floor
readonly CPU_MULTITHREAD_MIN=50000     # 7z multi-thread MIPS floor
readonly CPU_FREQ_MIN_MHZ=3500         # Minimum acceptable boost freq
readonly CPU_TEMP_MAX=85               # °C — throttle risk above this
readonly MEM_SIZE_EXPECTED_GB=128      # Expected total RAM
readonly MEM_BW_MIN_MBPS=15000         # mbw DUMB method floor (MB/s)
readonly STORAGE_SEQ_READ_MIN=5000     # MB/s floor for RAID 0
readonly STORAGE_SEQ_WRITE_MIN=4000    # MB/s floor for RAID 0
readonly STORAGE_RAND_READ_MIN=600000  # IOPS floor (4K random read)
readonly STORAGE_RAND_WRITE_MIN=500000 # IOPS floor (4K random write)
readonly RAID_DEVICE="/dev/md2"
readonly FIO_TESTFILE="/fio-healthcheck-tmp"

# --- Colours -----------------------------------------------------------------
# Stored as backslash escapes; they are interpreted by `echo -e` / `printf %b`.
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'
CYAN='\033[0;36m'; BOLD='\033[1m'; RESET='\033[0m'
PASS="${GREEN}[PASS]${RESET}"
FAIL="${RED}[FAIL]${RESET}"
WARN="${YELLOW}[WARN]${RESET}"
INFO="${CYAN}[INFO]${RESET}"

# --- Run state ---------------------------------------------------------------
QUICK=false   # --quick: skip the fio storage benchmark
FIX=false     # --fix:   attempt to correct issues that are found
FAILURES=0    # incremented by every failed check
WARNINGS=0    # incremented by every warning

# --- Argument parsing --------------------------------------------------------
for arg in "$@"; do
  case "$arg" in
    --quick) QUICK=true ;;
    --fix) FIX=true ;;
    *)
      # Unknown options used to be dropped silently, so a typo such as
      # `--quik` ran the full benchmark. Keep going (backward compatible) but
      # tell the user the option had no effect.
      printf '%b %s\n' "$WARN" "Ignoring unknown option: ${arg}" >&2
      ;;
  esac
done
# =============================================================================
# Helpers
# =============================================================================
#######################################
# Abort the script unless it is running with effective UID 0.
# Globals:   EUID, RED, RESET (read)
# Outputs:   an error message on stdout when not root
# Returns:   0 when root; otherwise exits the shell with status 1
#######################################
require_root() {
  if (( EUID == 0 )); then
    return 0
  fi
  echo -e "${RED}Error:${RESET} This script must be run as root (sudo)."
  exit 1
}
#######################################
# Report which external tools used by the checks are not installed.
# Purely advisory: each check skips itself when its tool is missing.
# Globals:   WARN (read)
# Outputs:   a warning plus an apt install hint on stdout when tools are missing
# Returns:   0 always
#######################################
check_deps() {
  local cmd pkg
  local missing_cmds=() missing_pkgs=()

  # python3 is included because the fio results are parsed with it.
  for cmd in 7z mbw fio mdadm dmidecode python3; do
    if command -v "$cmd" &>/dev/null; then
      continue
    fi
    # The install hint must name packages, not commands: the 7z binary is
    # shipped by p7zip-full (the same package the CPU check recommends).
    case "$cmd" in
      7z) pkg="p7zip-full" ;;
      *) pkg="$cmd" ;;
    esac
    missing_cmds+=("$cmd")
    missing_pkgs+=("$pkg")
  done

  if (( ${#missing_cmds[@]} > 0 )); then
    echo -e "${WARN} Missing tools: ${missing_cmds[*]}"
    echo -e " Install with: apt install ${missing_pkgs[*]}"
    echo ""
  fi
}
#######################################
# Print a section banner: a blank line, then the title framed by two rules.
# Globals:   BOLD, CYAN, RESET (read)
# Arguments: $1 - section title
# Outputs:   the banner on stdout
#######################################
header() {
  local title=$1
  local rule="══════════════════════════════════════════════"
  local line

  echo ""
  for line in "$rule" " ${title}" "$rule"; do
    echo -e "${BOLD}${CYAN}${line}${RESET}"
  done
}
#######################################
# Print one check result and update the global counters.
# Globals:   PASS, WARN, FAIL (read); WARNINGS, FAILURES (incremented)
# Arguments: $1 - status: "pass", "warn", anything else counts as a failure
#            $2 - label, $3 - measured value, $4 - expected value
# Outputs:   label/value line, badge line and a blank line on stdout
#######################################
result() {
  local status=$1 label=$2 value=$3 expected=$4
  local badge

  printf " %-35s %s\n" "$label:" "$value"
  case $status in
    pass)
      badge=$PASS
      ;;
    warn)
      badge=$WARN
      WARNINGS=$((WARNINGS + 1))
      ;;
    *)
      badge=$FAIL
      FAILURES=$((FAILURES + 1))
      ;;
  esac
  echo -e " ${badge} (expected: ${expected})"
  echo ""
}
# =============================================================================
# CPU Checks
# =============================================================================
#######################################
# Check CPU power settings, temperature, turbo state and 7-zip benchmark scores.
#
# Runs under `set -euo pipefail`, so every command that can legitimately fail
# (missing sysfs file, grep without a match, 7z error) is guarded; a value that
# cannot be read is reported as unavailable instead of aborting the script or
# being counted as a pass.
#
# Globals:   FIX, CPU_TEMP_MAX, CPU_FREQ_MIN_MHZ, CPU_MIPS_MIN,
#            CPU_MULTITHREAD_MIN, INFO, WARN, PASS (read); WARNINGS (written)
# Outputs:   check results on stdout
# Returns:   0
#######################################
check_cpu() {
  header "CPU — Intel i9-9900K"

  local cpu_sysfs=/sys/devices/system/cpu

  # --- Governor / EPP check --------------------------------------------------
  local governor epp
  governor=$(cat "${cpu_sysfs}/cpu0/cpufreq/scaling_governor" 2>/dev/null || echo "unknown")
  epp=$(cat "${cpu_sysfs}/cpu0/cpufreq/energy_performance_preference" 2>/dev/null || echo "unknown")
  echo -e " ${INFO} Governor: ${governor} | EPP: ${epp}"
  if [[ $governor != "performance" || $epp != "performance" ]]; then
    echo -e " ${WARN} CPU not in full performance mode — scores may be low"
    if $FIX; then
      echo -e " ${INFO} --fix: Setting performance governor and EPP..."
      # tee fails when a sysfs file is missing or read-only; with pipefail that
      # would kill the whole script, so record the failure and carry on.
      local fix_ok=true
      echo performance \
        | tee "${cpu_sysfs}"/cpu*/cpufreq/scaling_governor >/dev/null 2>&1 \
        || fix_ok=false
      echo performance \
        | tee "${cpu_sysfs}"/cpu*/cpufreq/energy_performance_preference >/dev/null 2>&1 \
        || fix_ok=false
      if $fix_ok; then
        echo -e " ${INFO} Done. Verify with: cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor"
      else
        echo -e " ${WARN} --fix: could not write one or more cpufreq settings"
      fi
    fi
    # Counted even after --fix: the system was found misconfigured.
    ((WARNINGS++)) || true
  else
    echo -e " ${PASS} Governor and EPP both set to performance"
  fi
  echo ""

  # --- Current frequency -----------------------------------------------------
  local cur_freq_khz cur_freq_mhz
  cur_freq_khz=$(cat "${cpu_sysfs}/cpu0/cpufreq/scaling_cur_freq" 2>/dev/null || echo "0")
  cur_freq_mhz=$((cur_freq_khz / 1000))
  echo -e " ${INFO} Current frequency (cpu0): ${cur_freq_mhz} MHz"
  echo ""

  # --- Temperature -----------------------------------------------------------
  # Hottest value across all thermal zones (sysfs reports millidegrees).
  local max_temp=0 zones_read=0 zone t
  for zone in /sys/class/thermal/thermal_zone*/temp; do
    t=$(cat "$zone" 2>/dev/null || true)
    if [[ $t =~ ^-?[0-9]+$ ]]; then
      zones_read=$((zones_read + 1))
      t=$((t / 1000))
      if (( t > max_temp )); then
        max_temp=$t
      fi
    fi
  done
  if (( zones_read == 0 )); then
    # Without any readable zone a "0°C" pass would be meaningless.
    echo -e " ${INFO} No readable thermal zones — temperature check skipped"
    echo ""
  elif (( max_temp >= CPU_TEMP_MAX )); then
    result "fail" "Max thermal zone temp" "${max_temp}°C" "< ${CPU_TEMP_MAX}°C"
  elif (( max_temp >= 75 )); then
    result "warn" "Max thermal zone temp" "${max_temp}°C" "< ${CPU_TEMP_MAX}°C (warm)"
  else
    result "pass" "Max thermal zone temp" "${max_temp}°C" "< ${CPU_TEMP_MAX}°C"
  fi

  # --- Turbo / pstate --------------------------------------------------------
  local no_turbo max_perf
  no_turbo=$(cat "${cpu_sysfs}/intel_pstate/no_turbo" 2>/dev/null || echo "unknown")
  max_perf=$(cat "${cpu_sysfs}/intel_pstate/max_perf_pct" 2>/dev/null || echo "unknown")
  case $no_turbo in
    1) result "warn" "Intel Turbo Boost" "DISABLED" "Enabled (no_turbo=0)" ;;
    0) result "pass" "Intel Turbo Boost" "Enabled" "Enabled (no_turbo=0)" ;;
    *)
      # Unreadable state must not be reported as "Enabled".
      echo -e " ${INFO} Intel Turbo Boost state unavailable (intel_pstate/no_turbo not readable)"
      echo ""
      ;;
  esac
  if [[ ! $max_perf =~ ^[0-9]+$ ]]; then
    echo -e " ${INFO} intel_pstate max_perf_pct unavailable"
    echo ""
  elif (( max_perf < 100 )); then
    result "warn" "intel_pstate max_perf_pct" "${max_perf}%" "100%"
  else
    result "pass" "intel_pstate max_perf_pct" "${max_perf}%" "100%"
  fi

  # --- 7-zip benchmark -------------------------------------------------------
  if ! command -v 7z &>/dev/null; then
    echo -e " ${WARN} 7z not found — skipping CPU benchmark (apt install p7zip-full)"
    echo ""
    return 0
  fi
  echo -e " ${INFO} Running 7-zip benchmark (this takes ~30s)..."
  local z7_output
  if ! z7_output=$(7z b 2>/dev/null); then
    echo -e " ${WARN} 7z benchmark exited with an error — CPU benchmark checks skipped"
    ((WARNINGS++)) || true
    echo ""
    return 0
  fi

  # Columns 3 and 4 of the "Tot:" line are used as the single-thread and
  # multi-thread MIPS figures. `|| true` keeps a missing line from tripping
  # `set -e`; the empty values are handled below.
  local tot_line mips_st mips_mt
  tot_line=$(grep "^Tot:" <<<"$z7_output" || true)
  mips_st=$(awk '{print $3}' <<<"$tot_line")
  mips_mt=$(awk '{print $4}' <<<"$tot_line")

  # Highest sample on the "1T CPU Freq" line. Only the text after the colon is
  # scanned so the "1" of "1T" is never mistaken for a frequency.
  local freq_1t
  freq_1t=$(awk -F: '
    /1T CPU Freq/ {
      n = split($2, sample, " ")
      for (i = 1; i <= n; i++) {
        if (sample[i] + 0 > max) {
          max = sample[i] + 0
        }
      }
    }
    END { if (max > 0) printf "%d\n", max }
  ' <<<"$z7_output")

  if [[ -n $freq_1t ]]; then
    if (( freq_1t >= CPU_FREQ_MIN_MHZ )); then
      result "pass" "1T boost frequency (7z)" "${freq_1t} MHz" ">= ${CPU_FREQ_MIN_MHZ} MHz"
    else
      result "fail" "1T boost frequency (7z)" "${freq_1t} MHz" ">= ${CPU_FREQ_MIN_MHZ} MHz"
    fi
  else
    echo -e " ${INFO} 7z did not report a 1T CPU frequency — frequency check skipped"
    echo ""
  fi

  if [[ $mips_st =~ ^[0-9]+$ ]]; then
    if (( mips_st >= CPU_MIPS_MIN )); then
      result "pass" "Single-thread MIPS (7z)" "${mips_st}" ">= ${CPU_MIPS_MIN}"
    else
      result "fail" "Single-thread MIPS (7z)" "${mips_st}" ">= ${CPU_MIPS_MIN}"
    fi
  else
    result "warn" "Single-thread MIPS (7z)" "unavailable" ">= ${CPU_MIPS_MIN}"
  fi

  if [[ $mips_mt =~ ^[0-9]+$ ]]; then
    if (( mips_mt >= CPU_MULTITHREAD_MIN )); then
      result "pass" "Multi-thread MIPS (7z)" "${mips_mt}" ">= ${CPU_MULTITHREAD_MIN}"
    else
      result "fail" "Multi-thread MIPS (7z)" "${mips_mt}" ">= ${CPU_MULTITHREAD_MIN}"
    fi
  else
    result "warn" "Multi-thread MIPS (7z)" "unavailable" ">= ${CPU_MULTITHREAD_MIN}"
  fi
}
# =============================================================================
# Memory Checks
# =============================================================================
#######################################
# Check installed RAM size, DIMM population, copy bandwidth and ECC counters.
#
# Runs under `set -euo pipefail`, so every grep/ls style lookup that may find
# nothing is guarded; missing data is reported instead of aborting the script.
#
# Globals:   MEM_SIZE_EXPECTED_GB, MEM_BW_MIN_MBPS, INFO, WARN (read)
# Outputs:   check results on stdout
# Returns:   0
#######################################
check_memory() {
  header "Memory — 4× 32GB DDR4-2666"

  # --- Size check ------------------------------------------------------------
  local mem_total_kb mem_total_gb
  mem_total_kb=$(awk '/^MemTotal:/ {print $2; exit}' /proc/meminfo 2>/dev/null || true)
  if [[ $mem_total_kb =~ ^[0-9]+$ ]]; then
    # Round to the nearest GiB. MemTotal is usable RAM (physical RAM minus
    # reserved regions), so truncating understated the size by up to 1 GiB and
    # could push a healthy system below the tolerance.
    mem_total_gb=$(( (mem_total_kb + 524288) / 1048576 ))
    if (( mem_total_gb >= MEM_SIZE_EXPECTED_GB - 2 )); then
      result "pass" "Total RAM" "${mem_total_gb} GB" "~${MEM_SIZE_EXPECTED_GB} GB"
    else
      result "fail" "Total RAM" "${mem_total_gb} GB" "~${MEM_SIZE_EXPECTED_GB} GB"
    fi
  else
    result "fail" "Total RAM" "unavailable" "~${MEM_SIZE_EXPECTED_GB} GB"
  fi

  # --- DIMM population / speed -----------------------------------------------
  if command -v dmidecode &>/dev/null; then
    # Query SMBIOS once and reuse the text for every lookup.
    local dmi dimm_count speed_list
    dmi=$(dmidecode -t memory 2>/dev/null || true)

    # Anchored so that e.g. "Volatile Size: 32 GB" lines are not counted too.
    # grep -c prints 0 itself on no match; `|| true` only absorbs its status.
    dimm_count=$(grep -c "^[[:space:]]*Size: [0-9]" <<<"$dmi" || true)
    speed_list=$(awk '/Configured Memory Speed:/ && !/Unknown/ {print $4}' <<<"$dmi" \
      | sort -u | tr '\n' ' ')
    echo -e " ${INFO} DIMMs populated: ${dimm_count} | Speed(s): ${speed_list}MT/s"

    # Check dual-channel (expect 2 DIMMs per channel).
    # Count only *populated* devices per channel: empty slots also carry a
    # Locator line, so counting Locator lines alone would always "pass".
    local channel_counts ch_a ch_b
    channel_counts=$(awk '
      /^Memory Device/ {
        if (populated) count[chan]++
        populated = 0
        chan = ""
      }
      /^[ \t]*Size: [0-9]/ { populated = 1 }
      /Locator: ChannelA/  { chan = "A" }
      /Locator: ChannelB/  { chan = "B" }
      END {
        if (populated) count[chan]++
        print count["A"] + 0, count["B"] + 0
      }
    ' <<<"$dmi")
    ch_a=${channel_counts% *}
    ch_b=${channel_counts#* }
    if (( ch_a > 0 && ch_b > 0 )); then
      result "pass" "Dual-channel population" "Channel A: ${ch_a}, Channel B: ${ch_b}" "Both channels populated"
    else
      result "warn" "Dual-channel population" "Channel A: ${ch_a}, Channel B: ${ch_b}" "Both channels should be populated"
    fi
  fi

  # --- ECC / errors ----------------------------------------------------------
  # Performed before the optional bandwidth test so a missing mbw does not
  # skip it. Glob the memory-controller directories directly instead of
  # parsing `ls`: a missing EDAC tree then simply yields no directories.
  local mc_dirs=() mc
  for mc in /sys/devices/system/edac/mc/mc*/; do
    if [[ -d $mc ]]; then
      mc_dirs+=("$mc")
    fi
  done
  if (( ${#mc_dirs[@]} > 0 )); then
    local ue_total=0 ce_total=0 ue ce
    for mc in "${mc_dirs[@]}"; do
      ue=$(cat "${mc}ue_count" 2>/dev/null || echo 0)
      ce=$(cat "${mc}ce_count" 2>/dev/null || echo 0)
      (( ue_total += ue )) || true
      (( ce_total += ce )) || true
    done
    if (( ue_total > 0 )); then
      result "fail" "Memory uncorrectable errors" "${ue_total}" "0"
    elif (( ce_total > 0 )); then
      result "warn" "Memory correctable errors" "${ce_total}" "0 (monitor closely)"
    else
      result "pass" "Memory ECC errors" "None" "0"
    fi
  else
    echo -e " ${INFO} EDAC not available — no ECC error reporting on this platform"
    echo ""
  fi

  # --- Bandwidth test --------------------------------------------------------
  if ! command -v mbw &>/dev/null; then
    echo -e " ${WARN} mbw not found — skipping memory bandwidth test (apt install mbw)"
    echo ""
    return 0
  fi
  echo -e " ${INFO} Running mbw bandwidth test (1024 MB)..."
  local mbw_output dumb_avg dumb_int
  mbw_output=$(mbw 1024 2>/dev/null || true)
  dumb_avg=$(grep "AVG.*DUMB" <<<"$mbw_output" | grep -oP 'Copy:\s+\K[0-9.]+' || true)
  dumb_int=${dumb_avg%.*}
  if [[ $dumb_int =~ ^[0-9]+$ ]]; then
    if (( dumb_int >= MEM_BW_MIN_MBPS )); then
      result "pass" "Memory bandwidth (DUMB)" "${dumb_avg} MB/s" ">= ${MEM_BW_MIN_MBPS} MB/s"
    else
      result "fail" "Memory bandwidth (DUMB)" "${dumb_avg} MB/s" ">= ${MEM_BW_MIN_MBPS} MB/s"
    fi
  else
    # mbw failed or its output format was not recognised.
    result "warn" "Memory bandwidth (DUMB)" "unavailable" ">= ${MEM_BW_MIN_MBPS} MB/s"
  fi
}
# =============================================================================
# Storage Checks
# =============================================================================
#######################################
# Run one fio job against FIO_TESTFILE and print a single integer metric.
# Globals:   FIO_TESTFILE (read)
# Arguments: $1 - fio job name      $2 - fio --rw mode   $3 - block size
#            $4 - numjobs           $5 - iodepth
#            $6 - JSON section to read ("read" or "write")
#            $7 - JSON key ("bw" or "iops"), summed over all jobs
#            $8 - divisor applied to the sum (1024 for bw, 1 for iops)
# Outputs:   the metric on stdout
# Returns:   non-zero when fio or the JSON parsing fails
#######################################
_fio_metric() {
  local name=$1 rw=$2 bs=$3 jobs=$4 depth=$5 section=$6 key=$7 divisor=$8
  fio --name="$name" --rw="$rw" --bs="$bs" --size=4G \
    --numjobs="$jobs" --iodepth="$depth" --runtime=30 --time_based \
    --ioengine=libaio --direct=1 --filename="$FIO_TESTFILE" \
    --output-format=json 2>/dev/null \
    | python3 -c 'import json, sys; s, k, d = sys.argv[1:4]; jobs = json.load(sys.stdin)["jobs"]; print(int(sum(j[s][k] for j in jobs) / float(d)))' \
      "$section" "$key" "$divisor"
}

#######################################
# Run one fio benchmark and record pass/fail against a minimum.
# A benchmark that cannot be run is recorded as a failure rather than aborting
# the script, so the test file is always cleaned up afterwards.
# Arguments: $1 - result label   $2 - unit suffix (e.g. " MB/s" or "")
#            $3 - minimum value  $4.. - arguments passed on to _fio_metric
#######################################
_storage_bench() {
  local label=$1 unit=$2 min=$3
  shift 3
  local value
  if ! value=$(_fio_metric "$@") || [[ ! $value =~ ^[0-9]+$ ]]; then
    result "fail" "$label" "unavailable (fio error)" ">= ${min}${unit}"
    return 0
  fi
  if (( value >= min )); then
    result "pass" "$label" "${value}${unit}" ">= ${min}${unit}"
  else
    result "fail" "$label" "${value}${unit}" ">= ${min}${unit}"
  fi
}

#######################################
# Check RAID state, NVMe presence, root filesystem usage and (unless --quick)
# fio throughput / IOPS on the root filesystem.
#
# Runs under `set -euo pipefail`: mdadm/df/fio failures are reported as check
# results instead of aborting, and the fio test file is removed on every path.
#
# Globals:   QUICK, RAID_DEVICE, FIO_TESTFILE, STORAGE_* minimums,
#            INFO, WARN, FAIL (read); FAILURES (written)
# Outputs:   check results on stdout
# Returns:   0
#######################################
check_storage() {
  header "Storage — RAID 0 (2× Toshiba KXG6 NVMe)"

  # --- RAID health -----------------------------------------------------------
  if ! command -v mdadm &>/dev/null; then
    echo -e " ${WARN} mdadm not found — skipping RAID health check"
  else
    local md_detail md2_state failed_devices
    # One mdadm call; `|| true` so a missing array yields a failed check
    # below rather than a silent script exit.
    md_detail=$(mdadm --detail "$RAID_DEVICE" 2>/dev/null || true)
    md2_state=$(awk '/State :/ {print $3; exit}' <<<"$md_detail")
    failed_devices=$(awk '/Failed Devices :/ {print $4; exit}' <<<"$md_detail")
    if [[ $md2_state == "clean" ]]; then
      result "pass" "RAID array state (md2)" "$md2_state" "clean"
    else
      result "fail" "RAID array state (md2)" "${md2_state:-unavailable}" "clean"
    fi
    if [[ $failed_devices == "0" ]]; then
      result "pass" "RAID failed devices" "$failed_devices" "0"
    else
      result "fail" "RAID failed devices" "${failed_devices:-unavailable}" "0 — REPLACE DRIVE IMMEDIATELY"
    fi
  fi

  # --- NVMe drive presence ---------------------------------------------------
  local drive
  for drive in nvme0n1 nvme1n1; do
    if [[ -b /dev/$drive ]]; then
      echo -e " ${INFO} Drive /dev/${drive} present and accessible"
    else
      echo -e " ${FAIL} Drive /dev/${drive} not found!"
      ((FAILURES++)) || true
    fi
  done
  echo ""

  # --- Disk space ------------------------------------------------------------
  # df -P guarantees one line per filesystem even for long device names.
  local used_pct
  used_pct=$(df -P / 2>/dev/null | awk 'NR==2 {sub(/%/, "", $5); print $5}' || true)
  if [[ ! $used_pct =~ ^[0-9]+$ ]]; then
    result "warn" "Root filesystem usage" "unavailable" "< 75%"
  elif (( used_pct >= 90 )); then
    result "fail" "Root filesystem usage" "${used_pct}%" "< 90%"
  elif (( used_pct >= 75 )); then
    result "warn" "Root filesystem usage" "${used_pct}%" "< 75% recommended"
  else
    result "pass" "Root filesystem usage" "${used_pct}%" "< 75%"
  fi

  # --- fio benchmark (skipped in --quick mode) -------------------------------
  if $QUICK; then
    echo -e " ${INFO} Storage benchmark skipped (--quick mode)"
    echo ""
    return 0
  fi
  if ! command -v fio &>/dev/null; then
    echo -e " ${WARN} fio not found — skipping storage benchmark (apt install fio)"
    echo ""
    return 0
  fi
  if ! command -v python3 &>/dev/null; then
    # The fio JSON output is parsed with python3.
    echo -e " ${WARN} python3 not found — skipping storage benchmark (apt install python3)"
    echo ""
    return 0
  fi

  # Check free space (need at least 6GB for test file)
  local free_kb free_gb
  free_kb=$(df -P / 2>/dev/null | awk 'NR==2 {print $4}' || true)
  if [[ ! $free_kb =~ ^[0-9]+$ ]]; then
    free_kb=0
  fi
  free_gb=$(( free_kb / 1024 / 1024 ))
  if (( free_gb < 6 )); then
    echo -e " ${WARN} Less than 6GB free — skipping fio test"
    echo ""
    return 0
  fi

  # Remove the 4G test file even if the script is interrupted mid-benchmark.
  trap 'rm -f -- "$FIO_TESTFILE"' EXIT

  echo -e " ${INFO} Running fio sequential read (30s)..."
  _storage_bench "Sequential read" " MB/s" "$STORAGE_SEQ_READ_MIN" \
    seq-read read 1M 1 8 read bw 1024

  echo -e " ${INFO} Running fio sequential write (30s)..."
  _storage_bench "Sequential write" " MB/s" "$STORAGE_SEQ_WRITE_MIN" \
    seq-write write 1M 1 8 write bw 1024

  echo -e " ${INFO} Running fio random 4K read (30s)..."
  _storage_bench "Random 4K read IOPS" "" "$STORAGE_RAND_READ_MIN" \
    rand-read randread 4k 4 32 read iops 1

  echo -e " ${INFO} Running fio random 4K write (30s)..."
  _storage_bench "Random 4K write IOPS" "" "$STORAGE_RAND_WRITE_MIN" \
    rand-write randwrite 4k 4 32 write iops 1

  rm -f -- "$FIO_TESTFILE"
  trap - EXIT
}
# =============================================================================
# Summary
# =============================================================================
#######################################
# Print the overall verdict and append a one-line record to the log file.
# Globals:   FAILURES, WARNINGS, colour variables, INFO, WARN (read)
# Outputs:   summary on stdout; one line appended to /var/log/healthcheck.log
# Returns:   0 (logging is best-effort)
#######################################
print_summary() {
  header "Summary"
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e " ${INFO} Completed at: ${timestamp}"
  echo ""
  if (( FAILURES > 0 )); then
    echo -e " ${RED}${BOLD}RESULT: ${FAILURES} failure(s), ${WARNINGS} warning(s)${RESET}"
    echo -e " ${RED}System requires attention.${RESET}"
  elif (( WARNINGS > 0 )); then
    echo -e " ${YELLOW}${BOLD}RESULT: 0 failures, ${WARNINGS} warning(s)${RESET}"
    echo -e " ${YELLOW}System is functional but review warnings above.${RESET}"
  else
    echo -e " ${GREEN}${BOLD}RESULT: All checks passed.${RESET}"
    echo -e " ${GREEN}System is healthy.${RESET}"
  fi
  echo ""

  # Optionally log to file. Logging stays best-effort, but only claim success
  # when the write actually happened. The braces make 2>/dev/null also cover
  # the shell's own "cannot open" message for the redirection target.
  local logfile="/var/log/healthcheck.log"
  if { echo "[${timestamp}] Failures: ${FAILURES} Warnings: ${WARNINGS}" >> "$logfile"; } 2>/dev/null; then
    echo -e " ${INFO} Result appended to ${logfile}"
  else
    echo -e " ${WARN} Could not write result to ${logfile}"
  fi
  echo ""
}
# =============================================================================
# Main
# =============================================================================
#######################################
# Entry point: verify privileges, print the banner, run every check group in
# order and exit with the number of failed checks as the status.
# Globals:   QUICK, FIX, FAILURES, BOLD, RESET, INFO (read)
#######################################
main() {
  require_root

  echo ""
  echo -e "${BOLD}System Health Check${RESET}"
  echo -e "Host: $(hostname) | Kernel: $(uname -r) | $(date)"
  if $QUICK; then
    echo -e "${INFO} Quick mode — storage benchmark skipped"
  fi
  if $FIX; then
    echo -e "${INFO} Fix mode — will attempt to correct issues"
  fi
  echo ""

  local step
  for step in check_deps check_cpu check_memory check_storage print_summary; do
    "$step"
  done

  exit "$FAILURES"
}

main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment