Skip to content

Instantly share code, notes, and snippets.

@polynomial
Created August 27, 2019 18:15
Show Gist options
  • Save polynomial/30552f7352401b9f5ae4bc2d1f2b3deb to your computer and use it in GitHub Desktop.
Save polynomial/30552f7352401b9f5ae4bc2d1f2b3deb to your computer and use it in GitHub Desktop.
#!/bin/bash
# Dumps a snapshot of the instance state to stdout.
#
# Arguments:
#   $1  dump type, "full" or "brief" (required). Later sections of the script
#       consult it to decide how much detail to print.
#   $2  optional upper bound, in minutes, for a random delay before the dump
#       starts.
# Exit status: 1 when $1 is neither "full" nor "brief".
dumpType=$1
case "$dumpType" in
  full | brief) ;;
  *)
    echo "Parameter 1 should be \"full\" or \"brief\"!"
    exit 1
    ;;
esac
if [[ -n "$2" ]]; then
  s=$2
  echo "Sleeping for a random period of time up to $s minutes"
  # The bound is handed to Ruby as an argument instead of being spliced into
  # the program text, so an unexpected value can only fail the numeric
  # conversion; it can never run as Ruby code.
  /usr/bin/ruby -e 't = Float(ARGV[0]) * 60 * rand(); puts "Sleeping for #{t} seconds"; sleep(t)' -- "$s"
fi
# Append /home/hadoop/bin/ to the command search path for the rest of the script.
PATH="$PATH:/home/hadoop/bin/"
# Print the current date and time at the top of the dump.
date
# Echo all further commands before running.
# In verbose mode the shell prints every input line as it reads it, comments
# included, so the comments from here down to "set +o verbose" also show up in
# the output, where they act as section headings. Treat them as output text.
# In this section the "full" dump differs from "brief" only in the process
# forest listing: unlimited line width instead of 200 columns.
set -v
# how long have we been up
uptime
# whats running, with ASCII-art process hierarchy (forest)
if [ "$dumpType" == "full" ]
then
ps auxwwwf
else
ps auxf --width=200
fi
# Top CPU users
ps auxwww --sort -%cpu | head -10
# Top memory users
ps auxwww --sort -rss | head -10
# hows the kernel looking
dmesg | tail -n 25
# dump instance controller log
tail -n 100 /emr/instance-controller/log/instance-controller.log
# dump instance controller stdout
tail -n 100 /emr/instance-controller/log/instance-controller.out
# dump log pusher log
tail -n 100 /emr/logpusher/log/logpusher.log
# dump log pusher stdout
tail -n 100 /emr/logpusher/log/logpusher.out
set +o verbose
# jstack can hang when it connects to a hung process.
# So that many such hung jstack processes do not exhaust CPU/RAM,
# find them and kill all of them as soon as two or more are pending.
stale_jstacks=$(pgrep jstack)
# Split the pgrep output on whitespace: one array element per PID.
stale_pids=($stale_jstacks)
if (( ${#stale_pids[@]} >= 2 )); then
  echo "Kill pending jstack processes $stale_jstacks"
  for stale_pid in "${stale_pids[@]}"; do
    echo "kill -9 $stale_pid"
    kill -9 $stale_pid
  done
fi
# Until the fix for https://bugs.openjdk.java.net/browse/JDK-8075773 is available in our jdk, check
# jps for each individual user.
#
# USERS holds the distinct owners of all running "java" processes.
# xargs -r: when pgrep finds nothing, ps must not run at all. Without -r it
# would run with no PID arguments and report the owner of the caller's own
# processes, and jps would then be run for a user that owns no JVM.
# NOTE(review): ps may abbreviate user names that do not fit its USER column;
# confirm that long account names come through intact on the target hosts.
USERS=($(pgrep java | xargs -r ps -o user= | sort -u))
for user in "${USERS[@]}"; do
  echo "============= Java processes for user ${user} ============="
  sudo -u "${user}" jps -lv | grep -v Jps
  echo "=========== End java processes for user ${user} ==========="
done
# dump jps and jstack for each jvm
if [[ "$dumpType" == "full" ]]; then
  for user in "${USERS[@]}"; do
    for pid in $(pgrep -u "${user}" java); do
      echo
      echo "============= Begin threads dump for process ${pid} of ${user} ============="
      # Compare the PID column exactly; a plain grep for the PID would also
      # print other JVMs whose PID or class name merely contains these digits.
      sudo -u "${user}" jps -l | awk -v pid="$pid" '$1 == pid'
      sudo -u "${user}" jstack "$pid"
      echo "============== End threads dump for process ${pid} of ${user} =============="
      echo
    done
  done
fi
# Switch verbose echo back on. Until the next "set +o verbose" the shell prints
# every line as it reads it, comments included, so the comments below appear in
# the output as section headings and should be treated as output text.
# This section prints:
#   - the last 20 lines of the service-nanny log whose file name carries the
#     current date,
#   - fields 6-9 of the '|'-separated mysqladmin process list, with blank
#     rows dropped,
#   - the result of fetching http://169.254.169.254/latest/user-data (one
#     attempt, 2 second timeout); the response body itself is discarded.
set -o verbose
# dump service nanny log
tail -n 20 /emr/service-nanny/log/service-nanny-`date +%Y-%m-%d`
# dump mysql process list
mysqladmin -u root processlist | awk -F'|' '{print $6, $7, $8, $9}' | grep -v '^\s*$'
# Can we contact dom0?
wget -O - -T 2 -t 1 http://169.254.169.254/latest/user-data >/dev/null
# Can we access master?
set +o verbose
# Verbose echo was turned off above because it does not work well with if
# statements: it prints both branches.

# Prints the PID of the first process whose command line matches the extended
# regular expression $1; prints nothing when no process matches.
# pgrep never reports itself, so a daemon that is not running cannot be
# confused with the search command (which "ps | grep PATTERN" allows), and
# "head -n 1" guarantees a single PID even when several processes match.
first_pid_matching() {
  pgrep -f "$1" | head -n 1
}

# masterHost as recorded in the instance controller's extraInstanceData.json.
# The script treats the value "localhost" as "this node is the master".
MASTER=$(ruby -e 'require "json"; puts JSON.parse($stdin.read)["masterHost"]' \
  < /emr/instance-controller/lib/info/extraInstanceData.json)
if [[ "$MASTER" == "localhost" ]]; then
  echo "On master, attempting to access port 8443"
  # Empty input makes nc close right after connecting; -w 2 bounds the wait.
  echo "" | nc -w 2 -v localhost 8443
  echo
  if [[ "$dumpType" == "full" ]]; then
    # Heap summaries for the master daemons that are currently running.
    RM_PID=$(first_pid_matching ResourceManager)
    if [[ -n "$RM_PID" ]]; then
      echo "ResourceManager heap stats:"
      jmap -heap "$RM_PID"
    fi
    JT_PID=$(first_pid_matching jobtracker)
    if [[ -n "$JT_PID" ]]; then
      echo "Jobtracker heap stats:"
      jmap -heap "$JT_PID"
    fi
    NN_PID=$(first_pid_matching namenode)
    if [[ -n "$NN_PID" ]]; then
      echo "Namenode heap stats:"
      jmap -heap "$NN_PID"
    fi
    # Open connections
    netstat -np
    # HDFS usage
    echo HDFS usage:
    su hadoop -c "hdfs dfsadmin -report"
  fi
fi
# Runs on every node, master or not.
/usr/bin/traceroute2masters.rb
# Switch verbose echo back on. Until the next "set +o verbose" the shell prints
# every line as it reads it, comments included, so the comments below appear in
# the output as section headings and should be treated as output text.
# This section prints:
#   - the first response header lines of a HEAD request to
#     elasticmapreduce.s3.amazonaws.com (1 s connect timeout, 5 s overall),
#   - a TCP traceroute to port 443 of the same host from source port 17241
#     (numeric output, at most 10 hops, 3 s wait per probe),
#   - the 25 most recent logins,
#   - five one-second samples each of extended iostat and vmstat,
#   - memory usage in MB and human-readable free disk space.
set -o verbose
# Can we access the outside world?
curl -s -I --connect-timeout 1 --max-time 5 http://elasticmapreduce.s3.amazonaws.com | head
# Now traceroute it
traceroute -T --sport=17241 -p 443 -w 3 -n -m 10 elasticmapreduce.s3.amazonaws.com
# listing of last logged in users
last -w -n 25
# whats io usage look like
iostat -x 1 5
# whats memory usage look like
free -m
# trend memory
vmstat 1 5
# amount of disk free
df -h
set +o verbose
# whats using the disk
# For every mounted filesystem that is more than 95% full, and always for
# /emr, list the largest directories in MB: first the top 10 by their own
# content only (du -S), then the top 20 with subdirectories included. du -x
# keeps each scan on a single filesystem.
#
# awk only selects the mount points; the du pipelines run in the shell loop so
# each mount point is passed as one quoted argument instead of being pasted
# into a command string.
df -P | awk '
  NR > 1 {
    # With -P every filesystem is reported on exactly one line, and the mount
    # point is field 6 onwards (it may contain spaces).
    mount = $6
    for (i = 7; i <= NF; i++) {
      mount = mount " " $i
    }
    pct = $5
    sub(/%$/, "", pct)
    if (pct + 0 > 95 || mount == "/emr") {
      print mount
    }
  }' |
while IFS= read -r mount; do
  # A nearly full root filesystem is examined through /home, not /.
  if [[ "$mount" == "/" ]]; then
    mount="/home"
  fi
  printf 'Top 10 folders in %s in MB:\n' "$mount"
  du -mxS -- "$mount" | sort -rn -k1 | head -n 10
  printf '\n'
  printf 'Top 20 folders including subdirectories in %s in MB:\n' "$mount"
  du -mx -- "$mount" | sort -rn -k1,1 -k2,2d | head -n 20
  printf '\n'
done
# Switch verbose echo back on for the final section; it stays on until the
# script ends. The shell prints every line as it reads it, comments included,
# so the comments below appear in the output as section headings and should be
# treated as output text.
# This section prints the netstat protocol statistics for both dump types, and
# the output of "mon-tool --local_metrics" for the "full" dump only.
set -o verbose
# dump network statistics
netstat -s -e
# dump instance metrics
if [ "$dumpType" == "full" ]
then
mon-tool --local_metrics
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment