Created
August 27, 2019 18:15
-
-
Save polynomial/30552f7352401b9f5ae4bc2d1f2b3deb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Dumps diagnostic state of this instance (processes, logs, JVMs, network,
# disk, memory) to stdout.
#
# Usage: <script> full|brief [max_sleep_minutes]
#   $1  "full" or "brief". "full" additionally prints the unabridged process
#       tree, per-JVM thread dumps, heap stats / connections / HDFS report on
#       the master, and local instance metrics.
#   $2  optional; when non-empty, the script first sleeps for a random period
#       of up to this many minutes.
# Exits 1 when $1 is neither "full" nor "brief".
dumpType=$1 # Should be "full" or "brief"
if [[ "$dumpType" != "full" && "$dumpType" != "brief" ]]; then
  echo "Parameter 1 should be \"full\" or \"brief\"!"
  exit 1
fi
#######################################
# Sleeps for a random duration between 0 and the given number of minutes.
# Arguments: $1 - upper bound in minutes (non-negative decimal number)
# Outputs:   progress messages on stdout
# Returns:   1 (without sleeping) if $1 is not a plain number,
#            otherwise the exit status of the ruby helper
#######################################
sleep_random_minutes() {
  local max_minutes=$1
  local -r number_re='^[0-9]+([.][0-9]+)?$'

  # Reject anything that is not a plain number: the value used to be pasted
  # straight into Ruby source, so arbitrary text would be executed as code.
  if [[ ! "$max_minutes" =~ $number_re ]]; then
    echo "Parameter 2 should be a number of minutes, got \"$max_minutes\"; not sleeping"
    return 1
  fi

  echo "Sleeping for a random period of time up to $max_minutes minutes"
  # The bound is handed over as an argument (ARGV), never interpolated.
  /usr/bin/ruby -e 't=ARGV[0].to_f*60*rand(); puts "Sleeping for #{t} seconds"; sleep(t)' "$max_minutes"
}

if [[ -n "$2" ]]; then
  sleep_random_minutes "$2"
fi
# Make tools under /home/hadoop/bin reachable by name for the rest of the run.
PATH="$PATH:/home/hadoop/bin/"
date
# Echo all further commands before running.
# NOTE: verbose mode prints script lines as the shell reads them, comments
# included, so the comments between here and "set +o verbose" also appear in
# the dump output as section headers. Keep them short and stable.
set -v
# how long have we been up
uptime
# whats running, with ASCII-art process hierarchy (forest)
if [[ "$dumpType" == "full" ]]; then
  ps auxwwwf
else
  ps auxf --width=200
fi
# Top CPU users
ps auxwww --sort -%cpu | head -n 10
# Top memory users
ps auxwww --sort -rss | head -n 10
# hows the kernel looking
dmesg | tail -n 25
# dump instance controller log
tail -n 100 /emr/instance-controller/log/instance-controller.log
# dump instance controller stdout
tail -n 100 /emr/instance-controller/log/instance-controller.out
# dump log pusher log
tail -n 100 /emr/logpusher/log/logpusher.log
# dump log pusher stdout
tail -n 100 /emr/logpusher/log/logpusher.out
set +o verbose
# jstack could hang when connecting to a hung process.
# To prevent many such hung jstack processes from exhausting CPU/RAM
# resources, detect and kill all of them if there are 2+ pending.
# SIGKILL is used directly because the targets are assumed to be stuck.
mapfile -t hung_jstack_pids < <(pgrep jstack)
if (( ${#hung_jstack_pids[@]} >= 2 )); then
  echo "Kill pending jstack processes ${hung_jstack_pids[*]}"
  for pid in "${hung_jstack_pids[@]}"; do
    echo "kill -9 $pid"
    kill -9 "$pid"
  done
fi
# Until the fix for https://bugs.openjdk.java.net/browse/JDK-8075773 is available in our jdk, check
# jps for each individual user.
#
# USERS holds the distinct owners of all processes matched by "pgrep java".
# "xargs -r" matters: without it, an empty pgrep result still runs a bare
# "ps -o user=", which reports the owner of unrelated processes and would put
# a bogus entry into USERS.
mapfile -t USERS < <(pgrep java | xargs -r ps -o user= | awk 'NF { print $1 }' | sort -u)
for user in "${USERS[@]}"; do
  echo "============= Java processes for user ${user} ============="
  sudo -u "${user}" jps -lv | grep -v Jps
  echo "=========== End java processes for user ${user} ==========="
done

# dump jps and jstack for each jvm
if [[ "$dumpType" == "full" ]]; then
  for user in "${USERS[@]}"; do
    # pgrep prints one numeric pid per line, so word-splitting is safe here.
    for pid in $(pgrep -u "${user}" java); do
      echo
      echo "============= Begin threads dump for process ${pid} of ${user} ============="
      # Compare the pid column exactly; a plain grep would also match longer
      # pids or class names that merely contain these digits.
      sudo -u "${user}" jps -l | awk -v pid="$pid" '$1 == pid'
      sudo -u "${user}" jstack "$pid"
      echo "============== End threads dump for process ${pid} of ${user} =============="
      echo
    done
  done
fi
# Service-nanny log (today's file), MySQL process list and a reachability
# probe of the instance metadata endpoint. The comments after
# "set -o verbose" are echoed into the dump, so they are left as-is.
set -o verbose
# dump service nanny log
tail -n 20 "/emr/service-nanny/log/service-nanny-$(date +%Y-%m-%d)"
# dump mysql process list
mysqladmin -u root processlist | awk -F'|' '{print $6, $7, $8, $9}' | grep -v '^\s*$'
# Can we contact dom0?
wget -O - -T 2 -t 1 http://169.254.169.254/latest/user-data >/dev/null
# Can we access master?
set +o verbose
# Verbose echo stays off for this section because it doesn't work well with
# if statements - it prints both sides.

#######################################
# Prints "<label> heap stats:" followed by "jmap -heap" output for every
# process whose full command line matches a pattern. Prints nothing when no
# process matches.
# pgrep never reports itself, so no "grep -v grep" / "[x]" trick is needed,
# and each pid gets its own jmap call (jmap takes a single pid).
# Arguments: $1 - label used in the heading
#            $2 - extended regex matched against the command line (pgrep -f)
# Returns:   0 when nothing matched, otherwise the status of the last jmap
#######################################
dump_heap_stats() {
  local label=$1
  local pattern=$2
  local -a pids
  local pid

  mapfile -t pids < <(pgrep -f "$pattern")
  (( ${#pids[@]} > 0 )) || return 0

  echo "${label} heap stats:"
  for pid in "${pids[@]}"; do
    jmap -heap "$pid"
  done
}

# masterHost as recorded by the instance controller; the check below treats
# the value "localhost" as "this instance is the master".
MASTER=$(ruby -e 'require "json"; puts JSON.parse($stdin.read)["masterHost"]' \
  < /emr/instance-controller/lib/info/extraInstanceData.json)
if [[ "$MASTER" == "localhost" ]]; then
  echo "On master, attempting to access port 8443"
  echo "" | nc -w 2 -v localhost 8443
  echo
  if [[ "$dumpType" == "full" ]]; then
    dump_heap_stats "ResourceManager" 'ResourceManager'
    dump_heap_stats "Jobtracker" 'jobtracker'
    dump_heap_stats "Namenode" 'namenode'
    # Open connections
    netstat -np
    # HDFS usage
    echo HDFS usage:
    su hadoop -c "hdfs dfsadmin -report"
  fi
fi
/usr/bin/traceroute2masters.rb
# Outside-world reachability, recent logins, and I/O / memory / disk-space
# snapshots. The comments after "set -o verbose" are echoed into the dump,
# so they are left as-is.
set -o verbose
# Can we access the outside world?
curl -s -I --connect-timeout 1 --max-time 5 http://elasticmapreduce.s3.amazonaws.com | head
# Now traceroute it
traceroute -T --sport=17241 -p 443 -w 3 -n -m 10 elasticmapreduce.s3.amazonaws.com
# listing of last logged in users
last -w -n 25
# whats io usage look like
iostat -x 1 5
# whats memory usage look like
free -m
# trend memory
vmstat 1 5
# amount of disk free
df -h
set +o verbose
# whats using the disk

#######################################
# Reads "df -P" style output on stdin and, for every filesystem that is more
# than 95% used or is mounted at /emr, prints the 10 largest directories
# (excluding subdirectory sizes) and the 20 largest directories (including
# subdirectories), in MB. A full "/" is reported via /home instead.
# Arguments: none (df output on stdin; first line is skipped as the header)
# Outputs:   the report on stdout
#######################################
report_disk_hogs() {
  awk 'NR > 1 {
    drive = $NF
    used = $5
    sub(/%$/, "", used)
    # "+ 0" forces a numeric comparison.
    if (used + 0 > 95 || drive == "/emr") {
      if (drive == "/") {
        drive = "/home"
      }
      # \047 is a single quote: keep the shell from splitting or expanding
      # the mount point inside the du command lines.
      target = "\047" drive "\047"
      printf("Top 10 folders in %s in MB:\n", drive)
      system("du -mxS " target " | sort -rn -k1 | head -n 10")
      printf("\n")
      printf("Top 20 folders including subdirectories in %s in MB:\n", drive)
      system("du -mx " target " | sort -rn -k1,1 -k2,2d | head -n 20")
      printf("\n")
    }
  }'
}

# -P keeps every filesystem on a single line; plain df may wrap long device
# names onto a second line, which shifts the columns read above.
df -P | report_disk_hogs
# Network protocol statistics, plus local instance metrics for "full" dumps.
# The comments after "set -o verbose" are echoed into the dump, so they are
# left as-is.
set -o verbose
# dump network statistics
netstat -s -e
# dump instance metrics
if [[ "$dumpType" == "full" ]]; then
  mon-tool --local_metrics
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment