Last active
April 19, 2026 20:57
-
-
Save zuedev/6319958012aed5a959c7726febd29bac to your computer and use it in GitHub Desktop.
Bash health check script for an Intel i9-9900K / 128GB DDR4-2666 / RAID 0 NVMe system running Debian 13.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| # ============================================================================= | |
| # System Health Check — i9-9900K / 128GB DDR4-2666 / RAID 0 NVMe | |
| # ============================================================================= | |
| # Usage: | |
| # chmod +x healthcheck.sh | |
| # sudo ./healthcheck.sh # Full check | |
| # sudo ./healthcheck.sh --quick # Skip fio storage test | |
| # sudo ./healthcheck.sh --fix # Attempt to fix issues found | |
| # ============================================================================= | |
# Strict mode: stop on unhandled errors, unset variables and failed pipeline stages.
set -o errexit -o nounset -o pipefail

# --- Thresholds (tuned to this system's known-good baseline) -----------------
# NOTE(review): CPU_EVENTS_MIN is not referenced by any check visible in this
# script — confirm whether it is still needed.
declare -r CPU_EVENTS_MIN=5000
declare -r CPU_MIPS_MIN=4000               # 7z single-thread MIPS floor
declare -r CPU_MULTITHREAD_MIN=50000       # 7z multi-thread MIPS floor
declare -r CPU_FREQ_MIN_MHZ=3500           # Minimum acceptable boost freq
declare -r CPU_TEMP_MAX=85                 # °C — throttle risk above this
declare -r MEM_SIZE_EXPECTED_GB=128        # Expected total RAM
declare -r MEM_BW_MIN_MBPS=15000           # mbw DUMB method floor (MB/s)
declare -r STORAGE_SEQ_READ_MIN=5000       # MB/s floor for RAID 0
declare -r STORAGE_SEQ_WRITE_MIN=4000      # MB/s floor for RAID 0
declare -r STORAGE_RAND_READ_MIN=600000    # IOPS floor (4K random read)
declare -r STORAGE_RAND_WRITE_MIN=500000   # IOPS floor (4K random write)
declare -r RAID_DEVICE="/dev/md2"          # md array inspected by the RAID check
declare -r FIO_TESTFILE="/fio-healthcheck-tmp"   # scratch file used by the fio runs
# --- Colours -----------------------------------------------------------------
# Raw ANSI escape strings; they are only turned into colour by `echo -e` later.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
RESET='\033[0m'

# Pre-coloured status tags used as message prefixes throughout the script.
printf -v PASS '%s[PASS]%s' "$GREEN" "$RESET"
printf -v FAIL '%s[FAIL]%s' "$RED" "$RESET"
printf -v WARN '%s[WARN]%s' "$YELLOW" "$RESET"
printf -v INFO '%s[INFO]%s' "$CYAN" "$RESET"
# Run-time switches (set from the command line) and global result counters.
QUICK=false   # --quick: skip the fio storage benchmark
FIX=false     # --fix: attempt to correct issues that are found
FAILURES=0    # incremented for every failed check
WARNINGS=0    # incremented for every warning

for arg in "$@"; do
  case "$arg" in
    --quick) QUICK=true ;;
    --fix) FIX=true ;;
    *)
      # A mistyped flag (e.g. "--quik") would otherwise be ignored silently and
      # the run would behave as if it had not been given; say so on stderr.
      printf 'Warning: ignoring unknown option: %s\n' "$arg" >&2
      ;;
  esac
done
| # ============================================================================= | |
| # Helpers | |
| # ============================================================================= | |
# Exit with status 1 unless the script is running as root (EUID 0).
# Outputs: an error message on stderr when not root.
require_root() {
  if ((EUID != 0)); then
    # Diagnostics go to stderr so stdout carries only the report itself.
    echo -e "${RED}Error:${RESET} This script must be run as root (sudo)." >&2
    exit 1
  fi
}
# Report which of the external tools used by the checks are not on PATH.
# Only prints a warning plus an install hint; it never aborts the run and does
# not touch the WARNINGS counter.
# Globals: WARN (read)
check_deps() {
  local -a missing_cmds=() missing_pkgs=()
  local cmd pkg
  # python3 is included because the fio results are parsed with it.
  for cmd in 7z mbw fio mdadm dmidecode python3; do
    if ! command -v "$cmd" &>/dev/null; then
      # The install hint needs package names, which are not always the command
      # name: the 7z benchmark's skip message in this script names p7zip-full.
      case $cmd in
        7z) pkg=p7zip-full ;;
        *) pkg=$cmd ;;
      esac
      missing_cmds+=("$cmd")
      missing_pkgs+=("$pkg")
    fi
  done
  if ((${#missing_cmds[@]} > 0)); then
    echo -e "${WARN} Missing tools: ${missing_cmds[*]}"
    echo -e " Install with: apt install ${missing_pkgs[*]}"
    echo ""
  fi
}
# Print a section banner: a blank line, a rule, the title ($1), and a rule.
# Globals: BOLD, CYAN, RESET (read)
header() {
  local rule="══════════════════════════════════════════════"
  local style="${BOLD}${CYAN}"
  echo ""
  echo -e "${style}${rule}${RESET}"
  echo -e "${style} $1${RESET}"
  echo -e "${style}${rule}${RESET}"
}
# Print one check result and update the global counters.
# Arguments: $1 status ("pass", "warn"; anything else counts as a failure)
#            $2 label, $3 measured value, $4 expected value (free text)
# Globals:   PASS, WARN, FAIL (read); WARNINGS, FAILURES (incremented)
result() {
  local status=$1 label=$2 value=$3 expected=$4
  local tag
  printf " %-35s %s\n" "$label:" "$value"
  case $status in
    pass)
      tag=$PASS
      ;;
    warn)
      tag=$WARN
      WARNINGS=$((WARNINGS + 1))
      ;;
    *)
      tag=$FAIL
      FAILURES=$((FAILURES + 1))
      ;;
  esac
  echo -e " ${tag} (expected: ${expected})"
  echo ""
}
| # ============================================================================= | |
| # CPU Checks | |
| # ============================================================================= | |
# Compare one benchmark figure against its floor and report it via result().
# A value that is missing or not a plain integer is reported as a warning
# instead of being fed to arithmetic.
# Arguments: $1 label, $2 value, $3 floor, $4 optional unit suffix (e.g. " MHz")
_cpu_report_floor() {
  local label=$1 value=$2 floor=$3 unit=${4:-}
  if [[ ! $value =~ ^[0-9]+$ ]]; then
    echo -e " ${WARN} ${label}: not found in 7z output — check skipped"
    echo ""
    ((WARNINGS++)) || true
    return 0
  fi
  if ((value >= floor)); then
    result "pass" "$label" "${value}${unit}" ">= ${floor}${unit}"
  else
    result "fail" "$label" "${value}${unit}" ">= ${floor}${unit}"
  fi
}

# CPU section: governor/EPP, current frequency, thermal zones, turbo/pstate
# settings and a 7-zip benchmark compared against the CPU_* thresholds.
# Globals: FIX, CPU_TEMP_MAX, CPU_FREQ_MIN_MHZ, CPU_MIPS_MIN,
#          CPU_MULTITHREAD_MIN, INFO/WARN/PASS (read); WARNINGS (written)
check_cpu() {
  header "CPU — Intel i9-9900K"

  # --- Governor / EPP check --------------------------------------------------
  local cpufreq0=/sys/devices/system/cpu/cpu0/cpufreq
  local governor epp
  governor=$(cat "${cpufreq0}/scaling_governor" 2>/dev/null || echo "unknown")
  epp=$(cat "${cpufreq0}/energy_performance_preference" 2>/dev/null || echo "unknown")
  echo -e " ${INFO} Governor: ${governor} | EPP: ${epp}"
  if [[ $governor != "performance" || $epp != "performance" ]]; then
    echo -e " ${WARN} CPU not in full performance mode — scores may be low"
    if $FIX; then
      echo -e " ${INFO} --fix: Setting performance governor and EPP..."
      # Each write is attempted independently and guarded: a missing or
      # read-only knob must not abort the whole run under `set -e`.
      local fix_ok=true
      echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor >/dev/null || fix_ok=false
      echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/energy_performance_preference >/dev/null || fix_ok=false
      if $fix_ok; then
        echo -e " ${INFO} Done. Verify with: cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor"
      else
        echo -e " ${WARN} --fix: could not write one or more cpufreq settings"
      fi
    fi
    ((WARNINGS++)) || true
  else
    echo -e " ${PASS} Governor and EPP both set to performance"
  fi
  echo ""

  # --- Current frequency -----------------------------------------------------
  local cur_freq_khz cur_freq_mhz
  cur_freq_khz=$(cat "${cpufreq0}/scaling_cur_freq" 2>/dev/null || echo "0")
  [[ $cur_freq_khz =~ ^[0-9]+$ ]] || cur_freq_khz=0
  cur_freq_mhz=$((cur_freq_khz / 1000))
  echo -e " ${INFO} Current frequency (cpu0): ${cur_freq_mhz} MHz"
  echo ""

  # --- Temperature -----------------------------------------------------------
  # Values are divided by 1000 to get °C; the hottest zone is reported.
  local max_temp=0 zone raw_temp temp_c
  for zone in /sys/class/thermal/thermal_zone*/temp; do
    raw_temp=$(cat "$zone" 2>/dev/null || echo "0")
    [[ $raw_temp =~ ^-?[0-9]+$ ]] || continue
    temp_c=$((raw_temp / 1000))
    if ((temp_c > max_temp)); then
      max_temp=$temp_c
    fi
  done
  if ((max_temp >= CPU_TEMP_MAX)); then
    result "fail" "Max thermal zone temp" "${max_temp}°C" "< ${CPU_TEMP_MAX}°C"
  elif ((max_temp >= 75)); then
    result "warn" "Max thermal zone temp" "${max_temp}°C" "< ${CPU_TEMP_MAX}°C (warm)"
  else
    result "pass" "Max thermal zone temp" "${max_temp}°C" "< ${CPU_TEMP_MAX}°C"
  fi

  # --- Turbo / pstate --------------------------------------------------------
  local no_turbo max_perf
  no_turbo=$(cat /sys/devices/system/cpu/intel_pstate/no_turbo 2>/dev/null || echo "unknown")
  max_perf=$(cat /sys/devices/system/cpu/intel_pstate/max_perf_pct 2>/dev/null || echo "unknown")
  # An unreadable knob is reported as unknown rather than as a pass.
  case $no_turbo in
    1) result "warn" "Intel Turbo Boost" "DISABLED" "Enabled (no_turbo=0)" ;;
    0) result "pass" "Intel Turbo Boost" "Enabled" "Enabled (no_turbo=0)" ;;
    *)
      echo -e " ${INFO} Intel Turbo Boost state unknown (intel_pstate/no_turbo not readable)"
      echo ""
      ;;
  esac
  if [[ $max_perf =~ ^[0-9]+$ ]]; then
    if ((max_perf < 100)); then
      result "warn" "intel_pstate max_perf_pct" "${max_perf}%" "100%"
    else
      result "pass" "intel_pstate max_perf_pct" "${max_perf}%" "100%"
    fi
  else
    echo -e " ${INFO} intel_pstate max_perf_pct unknown (not readable)"
    echo ""
  fi

  # --- 7-zip benchmark -------------------------------------------------------
  if ! command -v 7z &>/dev/null; then
    echo -e " ${WARN} 7z not found — skipping CPU benchmark (apt install p7zip-full)"
    echo ""
    return
  fi
  echo -e " ${INFO} Running 7-zip benchmark (this takes ~30s)..."
  local z7_output
  if ! z7_output=$(7z b 2>/dev/null); then
    # Without this guard a failing benchmark would end the script silently.
    result "fail" "7-zip benchmark" "did not complete" "benchmark completes"
    return
  fi

  # Fields 3 and 4 of the "Tot:" line are used as the single-thread and
  # multi-thread figures. awk is used instead of grep because a non-matching
  # grep would abort the script under `set -o pipefail`.
  local mips_st mips_mt freq_1t
  mips_st=$(awk '/^Tot:/ { print $3; exit }' <<<"$z7_output")
  mips_mt=$(awk '/^Tot:/ { print $4; exit }' <<<"$z7_output")
  # Highest integer on the "1T CPU Freq" line.
  freq_1t=$(awk '/1T CPU Freq/ {
      for (i = 1; i <= NF; i++) if ($i ~ /^[0-9]+$/ && $i + 0 > max) max = $i + 0
    }
    END { if (max) print max }' <<<"$z7_output")

  _cpu_report_floor "1T boost frequency (7z)" "$freq_1t" "$CPU_FREQ_MIN_MHZ" " MHz"
  _cpu_report_floor "Single-thread MIPS (7z)" "$mips_st" "$CPU_MIPS_MIN"
  _cpu_report_floor "Multi-thread MIPS (7z)" "$mips_mt" "$CPU_MULTITHREAD_MIN"
}
| # ============================================================================= | |
| # Memory Checks | |
| # ============================================================================= | |
# Memory section: total size, DIMM population/speed (dmidecode), mbw copy
# bandwidth and EDAC error counters.
# Globals: MEM_SIZE_EXPECTED_GB, MEM_BW_MIN_MBPS, INFO/WARN (read);
#          WARNINGS (written)
check_memory() {
  header "Memory — 4× 32GB DDR4-2666"

  # --- Size check ------------------------------------------------------------
  # MemTotal (kB) is divided by 1024 twice and truncated; up to 2 GB below the
  # expected size is tolerated.
  local mem_total_kb mem_total_gb
  mem_total_kb=$(awk '/MemTotal/ { print $2; exit }' /proc/meminfo 2>/dev/null || true)
  if [[ $mem_total_kb =~ ^[0-9]+$ ]]; then
    mem_total_gb=$((mem_total_kb / 1024 / 1024))
    if ((mem_total_gb >= MEM_SIZE_EXPECTED_GB - 2)); then
      result "pass" "Total RAM" "${mem_total_gb} GB" "~${MEM_SIZE_EXPECTED_GB} GB"
    else
      result "fail" "Total RAM" "${mem_total_gb} GB" "~${MEM_SIZE_EXPECTED_GB} GB"
    fi
  else
    result "fail" "Total RAM" "unknown" "~${MEM_SIZE_EXPECTED_GB} GB"
  fi

  # --- DIMM population / speed -----------------------------------------------
  if command -v dmidecode &>/dev/null; then
    # dmidecode is run once and its output reused. Every grep below is guarded
    # with `|| true` because `grep -c` exits 1 on zero matches, which would
    # otherwise abort the script under `set -e`.
    local dmi_out dimm_count speed_list ch_a ch_b
    dmi_out=$(dmidecode -t memory 2>/dev/null || true)
    # Anchored so that only lines starting with "Size:" are counted, not
    # other fields whose name merely ends in "Size:".
    dimm_count=$(grep -c '^[[:space:]]*Size: [0-9]' <<<"$dmi_out" || true)
    speed_list=$(awk '/Configured Memory Speed:/ && !/Unknown/ { print $4 }' <<<"$dmi_out" \
      | sort -u | tr '\n' ' ')
    echo -e " ${INFO} DIMMs populated: ${dimm_count} | Speed(s): ${speed_list:-unknown }MT/s"

    # Check dual-channel: at least one locator entry per channel.
    ch_a=$(grep -c "Locator: ChannelA" <<<"$dmi_out" || true)
    ch_b=$(grep -c "Locator: ChannelB" <<<"$dmi_out" || true)
    if ((ch_a > 0 && ch_b > 0)); then
      result "pass" "Dual-channel population" "Channel A: ${ch_a}, Channel B: ${ch_b}" "Both channels populated"
    else
      result "warn" "Dual-channel population" "Channel A: ${ch_a}, Channel B: ${ch_b}" "Both channels should be populated"
    fi
  fi

  # --- Bandwidth test --------------------------------------------------------
  # A missing mbw only skips this test; the ECC check below still runs.
  if command -v mbw &>/dev/null; then
    echo -e " ${INFO} Running mbw bandwidth test (1024 MB)..."
    local mbw_output dumb_avg
    local number_re='^[0-9]+([.][0-9]*)?$'
    mbw_output=$(mbw 1024 2>/dev/null || true)
    # Value following "Copy:" on the AVG line of the DUMB method.
    dumb_avg=$(awk '/AVG.*DUMB/ {
        for (i = 1; i < NF; i++) if ($i == "Copy:") { print $(i + 1); exit }
      }' <<<"$mbw_output")
    if [[ $dumb_avg =~ $number_re ]]; then
      local dumb_int=${dumb_avg%.*}
      if ((dumb_int >= MEM_BW_MIN_MBPS)); then
        result "pass" "Memory bandwidth (DUMB)" "${dumb_avg} MB/s" ">= ${MEM_BW_MIN_MBPS} MB/s"
      else
        result "fail" "Memory bandwidth (DUMB)" "${dumb_avg} MB/s" ">= ${MEM_BW_MIN_MBPS} MB/s"
      fi
    else
      echo -e " ${WARN} Could not parse mbw output — bandwidth check skipped"
      echo ""
      ((WARNINGS++)) || true
    fi
  else
    echo -e " ${WARN} mbw not found — skipping memory bandwidth test (apt install mbw)"
    echo ""
  fi

  # --- ECC / errors ----------------------------------------------------------
  # Memory controllers are collected with a glob instead of `ls | wc -l`: the
  # latter fails the pipeline (and therefore the script) when the EDAC
  # directory does not exist.
  local -a mc_dirs=()
  local mc
  for mc in /sys/devices/system/edac/mc/mc*/; do
    if [[ -d $mc ]]; then
      mc_dirs+=("$mc")
    fi
  done
  if ((${#mc_dirs[@]} > 0)); then
    local ue_total=0 ce_total=0 ue ce
    for mc in "${mc_dirs[@]}"; do
      ue=$(cat "${mc}ue_count" 2>/dev/null || echo 0)
      ce=$(cat "${mc}ce_count" 2>/dev/null || echo 0)
      [[ $ue =~ ^[0-9]+$ ]] || ue=0
      [[ $ce =~ ^[0-9]+$ ]] || ce=0
      ((ue_total += ue)) || true
      ((ce_total += ce)) || true
    done
    if ((ue_total > 0)); then
      result "fail" "Memory uncorrectable errors" "${ue_total}" "0"
    elif ((ce_total > 0)); then
      result "warn" "Memory correctable errors" "${ce_total}" "0 (monitor closely)"
    else
      result "pass" "Memory ECC errors" "None" "0"
    fi
  else
    echo -e " ${INFO} EDAC not available — no ECC error reporting on this platform"
    echo ""
  fi
}
| # ============================================================================= | |
| # Storage Checks | |
| # ============================================================================= | |
# Run one fio job against $FIO_TESTFILE and print a single integer metric.
# Arguments: $1 job name, $2 rw mode, $3 block size, $4 numjobs, $5 iodepth,
#            $6 JSON section to read ("read" or "write"),
#            $7 metric: "bw" (first job's bw field divided by 1024) or
#               "iops" (sum of iops over all jobs)
# Returns:   non-zero when fio fails or its JSON cannot be parsed
_fio_metric() {
  local name=$1 rw=$2 bs=$3 jobs=$4 depth=$5 section=$6 metric=$7
  local json
  json=$(fio --name="$name" --rw="$rw" --bs="$bs" --size=4G \
    --numjobs="$jobs" --iodepth="$depth" --runtime=30 --time_based \
    --ioengine=libaio --direct=1 --filename="$FIO_TESTFILE" \
    --output-format=json 2>/dev/null) || return 1
  python3 -c '
import json, sys
data = json.load(sys.stdin)
section, metric = sys.argv[1], sys.argv[2]
if metric == "bw":
    print(int(data["jobs"][0][section]["bw"] / 1024))
else:
    print(int(sum(job[section]["iops"] for job in data["jobs"])))
' "$section" "$metric" <<<"$json" 2>/dev/null
}

# Run one fio benchmark and report it against a floor via result().
# A benchmark that fails to run is reported as a failed check instead of
# terminating the script.
# Arguments: $1 label, $2 unit suffix (e.g. " MB/s" or ""), $3 floor,
#            $4.. arguments passed through to _fio_metric
_fio_check() {
  local label=$1 unit=$2 floor=$3
  shift 3
  local value
  if value=$(_fio_metric "$@") && [[ $value =~ ^[0-9]+$ ]]; then
    if ((value >= floor)); then
      result "pass" "$label" "${value}${unit}" ">= ${floor}${unit}"
    else
      result "fail" "$label" "${value}${unit}" ">= ${floor}${unit}"
    fi
  else
    result "fail" "$label" "benchmark failed" ">= ${floor}${unit}"
  fi
}

# Storage section: md RAID state, NVMe device presence, root filesystem usage
# and (unless --quick) four fio benchmarks compared to the STORAGE_* floors.
# Globals: RAID_DEVICE, FIO_TESTFILE, QUICK, STORAGE_* (read);
#          FAILURES, WARNINGS (written)
check_storage() {
  header "Storage — RAID 0 (2× Toshiba KXG6 NVMe)"
  local md_name=${RAID_DEVICE##*/}

  # --- RAID health -----------------------------------------------------------
  if ! command -v mdadm &>/dev/null; then
    echo -e " ${WARN} mdadm not found — skipping RAID health check"
  else
    local md_detail md_state failed_devices
    # mdadm is queried once; a missing or unreadable array becomes a failed
    # check rather than aborting the script through `set -o pipefail`.
    if md_detail=$(mdadm --detail "$RAID_DEVICE" 2>/dev/null); then
      md_state=$(awk '/State :/ { print $3; exit }' <<<"$md_detail")
      failed_devices=$(awk '/Failed Devices :/ { print $4; exit }' <<<"$md_detail")
      if [[ $md_state == "clean" ]]; then
        result "pass" "RAID array state (${md_name})" "$md_state" "clean"
      else
        result "fail" "RAID array state (${md_name})" "${md_state:-unknown}" "clean"
      fi
      if [[ $failed_devices == "0" ]]; then
        result "pass" "RAID failed devices" "$failed_devices" "0"
      else
        result "fail" "RAID failed devices" "${failed_devices:-unknown}" "0 — REPLACE DRIVE IMMEDIATELY"
      fi
    else
      result "fail" "RAID array state (${md_name})" "unavailable" "clean"
    fi
  fi

  # --- NVMe drive presence (block device nodes exist) ------------------------
  local drive
  for drive in nvme0n1 nvme1n1; do
    if [[ -b /dev/$drive ]]; then
      echo -e " ${INFO} Drive /dev/${drive} present and accessible"
    else
      echo -e " ${FAIL} Drive /dev/${drive} not found!"
      ((FAILURES++)) || true
    fi
  done
  echo ""

  # --- Disk space ------------------------------------------------------------
  # -P keeps each filesystem on one line so the awk column numbers hold.
  local used_pct
  used_pct=$(df -P / | awk 'NR==2 { print $5 }' | tr -d '%')
  if [[ ! $used_pct =~ ^[0-9]+$ ]]; then
    echo -e " ${WARN} Could not determine root filesystem usage"
    echo ""
    ((WARNINGS++)) || true
  elif ((used_pct >= 90)); then
    result "fail" "Root filesystem usage" "${used_pct}%" "< 90%"
  elif ((used_pct >= 75)); then
    result "warn" "Root filesystem usage" "${used_pct}%" "< 75% recommended"
  else
    result "pass" "Root filesystem usage" "${used_pct}%" "< 75%"
  fi

  # --- fio benchmark (skipped in --quick mode) -------------------------------
  if $QUICK; then
    echo -e " ${INFO} Storage benchmark skipped (--quick mode)"
    echo ""
    return
  fi
  if ! command -v fio &>/dev/null; then
    echo -e " ${WARN} fio not found — skipping storage benchmark (apt install fio)"
    echo ""
    return
  fi
  if ! command -v python3 &>/dev/null; then
    echo -e " ${WARN} python3 not found — skipping storage benchmark (needed to parse fio output)"
    echo ""
    return
  fi

  # Check free space (need at least 6GB for test file)
  local free_kb free_gb
  free_kb=$(df -P / | awk 'NR==2 { print $4 }')
  [[ $free_kb =~ ^[0-9]+$ ]] || free_kb=0
  free_gb=$((free_kb / 1024 / 1024))
  if ((free_gb < 6)); then
    echo -e " ${WARN} Less than 6GB free — skipping fio test"
    echo ""
    return
  fi

  # Remove the test file even if the script exits before the cleanup below.
  trap 'rm -f -- "$FIO_TESTFILE"' EXIT

  echo -e " ${INFO} Running fio sequential read (30s)..."
  _fio_check "Sequential read" " MB/s" "$STORAGE_SEQ_READ_MIN" \
    seq-read read 1M 1 8 read bw

  echo -e " ${INFO} Running fio sequential write (30s)..."
  _fio_check "Sequential write" " MB/s" "$STORAGE_SEQ_WRITE_MIN" \
    seq-write write 1M 1 8 write bw

  echo -e " ${INFO} Running fio random 4K read (30s)..."
  _fio_check "Random 4K read IOPS" "" "$STORAGE_RAND_READ_MIN" \
    rand-read randread 4k 4 32 read iops

  echo -e " ${INFO} Running fio random 4K write (30s)..."
  _fio_check "Random 4K write IOPS" "" "$STORAGE_RAND_WRITE_MIN" \
    rand-write randwrite 4k 4 32 write iops

  rm -f -- "$FIO_TESTFILE"
  trap - EXIT
}
| # ============================================================================= | |
| # Summary | |
| # ============================================================================= | |
# Print the overall verdict and append a one-line record to the log file.
# Globals: FAILURES, WARNINGS, colour/tag variables (read)
# Outputs: summary on stdout; one line appended to /var/log/healthcheck.log
print_summary() {
  header "Summary"
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e " ${INFO} Completed at: ${timestamp}"
  echo ""
  if ((FAILURES > 0)); then
    echo -e " ${RED}${BOLD}RESULT: ${FAILURES} failure(s), ${WARNINGS} warning(s)${RESET}"
    echo -e " ${RED}System requires attention.${RESET}"
  elif ((WARNINGS > 0)); then
    echo -e " ${YELLOW}${BOLD}RESULT: 0 failures, ${WARNINGS} warning(s)${RESET}"
    echo -e " ${YELLOW}System is functional but review warnings above.${RESET}"
  else
    echo -e " ${GREEN}${BOLD}RESULT: All checks passed.${RESET}"
    echo -e " ${GREEN}System is healthy.${RESET}"
  fi
  echo ""

  # Logging is best effort, but only claim success when the write worked.
  # The group redirect silences the shell's own "cannot open" message, which a
  # trailing 2>/dev/null on the same command would not catch.
  local logfile="/var/log/healthcheck.log"
  if { echo "[${timestamp}] Failures: ${FAILURES} Warnings: ${WARNINGS}" >>"$logfile"; } 2>/dev/null; then
    echo -e " ${INFO} Result appended to ${logfile}"
  else
    echo -e " ${WARN} Could not write to ${logfile}"
  fi
  echo ""
}
| # ============================================================================= | |
| # Main | |
| # ============================================================================= | |
# Entry sequence: verify privileges, print the banner, run every section in
# order, then summarise.
require_root

echo ""
echo -e "${BOLD}System Health Check${RESET}"
echo -e "Host: $(hostname) | Kernel: $(uname -r) | $(date)"
if $QUICK; then
  echo -e "${INFO} Quick mode — storage benchmark skipped"
fi
if $FIX; then
  echo -e "${INFO} Fix mode — will attempt to correct issues"
fi
echo ""

check_deps
check_cpu
check_memory
check_storage
print_summary

# The exit status is the failure count. Statuses are truncated to 8 bits, so
# an unclamped count of 256 would be reported to the caller as success.
exit "$((FAILURES > 255 ? 255 : FAILURES))"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment