Last active
January 12, 2018 02:23
-
-
Save dcapwell/d290fcc82710c43c38482db5b8939629 to your computer and use it in GitHub Desktop.
Usage, Saturation, Errors checklist script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
##
## Script is an automation for collecting metrics that represent (U)sage, (S)aturation, and (E)rrors (USE).
## This script is mostly automating [1], so more details can be found there.
##
## References
## * [1] http://www.brendangregg.com/USEmethod/use-linux.html
##
#set -x
#set -e
#set -o pipefail
# Only nounset is enabled; errexit and pipefail are left commented out.
# NOTE(review): presumably because individual probes are expected to fail on
# hosts where a tool is missing -- confirm before enabling them.
set -u
# Print a section heading surrounded by blank lines.
# Arguments: the heading text; several words are joined into one string
#   (space-separated under the default IFS).
# Outputs:   an empty line, "### <heading> ###", and another empty line.
function banner() {
  # "$*" rather than "$@": the heading is a single string, not a word list.
  printf '\n### %s ###\n\n' "$*"
}
# Print an interpretation hint for the output of the command that follows.
# Arguments: the hint text; several words are joined into one string
#   (space-separated under the default IFS).
# Outputs:   "// <hint>" on a single line.
function comment() {
  # "$*" rather than "$@": the hint is a single string, not a word list.
  printf '// %s\n' "$*"
}
# Echo a command line, then execute it with stderr merged into stdout.
# Arguments: the command. All arguments are joined into one string
#   (space-separated under the default IFS) and evaluated, so a single quoted
#   argument may contain shell syntax such as pipes or "&&".
# Outputs:   an empty line, "[COMMAND] <command>", then the command's output.
# Returns:   always 0, so a failing or missing tool never stops the script.
# NOTE(review): the string is re-parsed by eval; only pass trusted,
#   script-authored command lines.
function run() {
  printf '\n[COMMAND] %s\n' "$*"
  # Failures are deliberately ignored (best-effort collection).
  eval "$*" 2>&1 || true
}
# Sampling interval (seconds) and number of samples handed to the
# interval-based tools (vmstat, dstat, mpstat, ...). Both default to the
# original values and may be overridden from the environment.
DELAY=${DELAY:-1}
COUNT=${COUNT:-10}

banner "Host Details"
run hostname -f
run cat /etc/*release*
run uname -a
# NOTE(review): the interface name is hard-coded; hosts whose primary
# interface is not called eth0 will only record an ethtool error here.
run ethtool eth0
run cat /proc/cpuinfo
run lscpu
run numactl --hardware
run uptime
# should be 1
run cat /proc/sys/kernel/perf_event_paranoid
# should be 0
run cat /proc/sys/kernel/kptr_restrict
run sudo -n perf list
run ps -ef
banner "CPU utilization (system-wide)"
comment '"us" + "sy" + "st"'
run vmstat -n "$DELAY" "$COUNT"
comment 'sum fields except "%idle" and "%iowait"'
run sar -u
comment 'sum fields except "idl" and "wai"'
run dstat -c "$DELAY" "$COUNT"
run numastat
run sudo -n perf stat -a -- sleep 30

banner "CPU utilization (per-cpu)"
# could also run sar -P ALL if mpstat is not installed
comment 'sum fields except "%idle" and "%iowait"'
run mpstat -P ALL "$DELAY" "$COUNT"

banner "CPU utilization (per-process)"
comment '"%CPU"'
run "echo P | top -b -d $DELAY -n $COUNT"
comment '"%CPU"'
run pidstat "$DELAY" "$COUNT"

banner "CPU saturation (system-wide)"
comment '"r" > CPU count'
run vmstat "$DELAY" "$COUNT"
comment '"runq-sz" > CPU count'
run sar -q
comment '"run" > CPU count'
run dstat -p "$DELAY" "$COUNT"

banner "CPU saturation (per-process)"
comment '2nd field (sched_info.run_delay)'
# "pid=" gives the column an empty header, so ps prints no header line and the
# literal word "PID" is never used as a process id. Word splitting of the
# command substitution is intentional: one pid per word.
for pid in $(ps -A -o pid=); do
  # Processes may exit between the ps call and the read; run ignores failures.
  run cat "/proc/${pid}/schedstat"
done

# some kernel settings block this, so don't fail and move on
comment '(shows "Average" and "Maximum" delay per-schedule)'
run "sudo -n perf sched record -- sleep 1 && sudo -n perf sched latency -v"
run "sudo -n perf timechart"
banner "Memory utilization (system-wide)"
comment '"Mem:" (main memory), "Swap:" (virtual memory)'
run free -m
comment '"free" (main memory), "swap" (virtual memory)'
run vmstat "$DELAY" "$COUNT"
comment '"%memused"'
run sar -r
comment '"free"'
run dstat -m "$DELAY" "$COUNT"
comment 'for kmem slab usage'
run sudo -n slabtop -s c --once

banner "Memory saturation (system-wide)"
comment '"si"/"so" (swapping)'
run vmstat "$DELAY" "$COUNT"
comment '"pgscank" + "pgscand" (scanning)'
run sar -B
run sar -W

banner "Memory saturation (per-process)"
comment '10th field (min_flt)'
# "pid=" gives the column an empty header, so ps prints no header line and the
# literal word "PID" is never used as a process id. Word splitting of the
# command substitution is intentional: one pid per word.
for pid in $(ps -A -o pid=); do
  # Processes may exit between the ps call and the read; run ignores failures.
  run cat "/proc/${pid}/stat"
done

banner "Memory errors"
# Case-insensitive so messages spelled "Killed process ..." are matched too.
run "dmesg | grep -i killed"
run dmesg
banner "Network utilization"
comment '"rxKB/s"/max "txKB/s"/max'
run sar -n DEV "$DELAY" "$COUNT"
comment 'RX/TX tput / max bandwidth'
run ip -s link
comment '"bytes" RX/TX tput/max'
run cat /proc/net/dev

banner "Network saturation"
comment '"overruns", "dropped"'
run ifconfig
comment '"segments retransmited"'
run netstat -s
comment '*drop and *fifo metrics'
run sar -n EDEV

# Several probes below repeat the saturation ones on purpose: the same output
# is read for different columns, as named in each hint.
banner "Network errors"
comment ' "errors", "dropped"'
run ifconfig
comment '"RX-ERR"/"TX-ERR"'
run netstat -i
comment '"rxerr/s" "txerr/s"'
run sar -n EDEV
comment '"errs", "drop"'
run cat /proc/net/dev
banner "I/O utilization (system-wide)"
comment '"%util"'
run iostat -xz "$DELAY" "$COUNT"
comment '"%util"'
run sar -d

banner "I/O utilization (per-process)"
run sudo -n iotop --batch "--delay=$DELAY" "--iter=$COUNT"
run pidstat -d
# "pid=" gives the column an empty header, so ps prints no header line and the
# literal word "PID" is never used as a process id. Word splitting of the
# command substitution is intentional: one pid per word.
for pid in $(ps -A -o pid=); do
  # The whole pipeline is handed to run as one string so that grep filters
  # only the file contents and not the "[COMMAND]" line that run prints.
  run "cat /proc/${pid}/sched | grep 'se.statistics.iowait_sum'"
done

banner "I/O saturation"
comment '"avgqu-sz" > 1, or high "await"'
run iostat -xnz "$DELAY" "$COUNT"
# Word splitting of the find output is relied on here; run re-parses its
# arguments anyway, so paths are assumed to contain no whitespace.
for d in $(find /sys/devices/ -name ioerr_cnt); do
  run cat "$d"
done

banner "Storage"
run swapon -s
run free
run "cat /proc/meminfo | grep -i swap"
run df -h
run "sudo -n cat /var/log/messages | grep -i err"

banner "Storage Controller"
comment 'sum devices and compare to known IOPS/tput limits per-card'
run iostat -xz "$DELAY" "$COUNT"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment