Last active
August 29, 2015 14:27
-
-
Save awreece/0a5a37dbdc9ff82ab55c to your computer and use it in GitHub Desktop.
[WORK IN PROGRESS] automatically generate latency graphs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# | |
# Generates a "latency heat map", showing both on-CPU and off-CPU stacks. In addition, | |
# attempts to color code on-CPU based on CPI by diffling the on-CPU stacks sampled | |
# at ~1000 Hz and every ~3e6 instructions. Intuitively, if a stack is more common in | |
# frequency stacks than in count stacks, than it represents a "slower" stack. | |
# | |
perf_data_dir=$1 | |
dest_dir=$2 | |
function_filter=$3 | |
script_dir=$(dirname $0) | |
export PATH="$script_dir:$PATH" | |
freq_data_file=$perf_data_dir/freq.data | |
freq_ms_stacks_file=$dest_dir/freq.ms_stacks | |
sched_data_file=$perf_data_dir/sched.data | |
sched_diffed_stacks_file=$dest_dir/sched.diffed_stacks | |
count_data_file=$perf_data_dir/count.data | |
count_raw_stacks_file=$dest_dir/count.raw_stacks | |
latency_svg_file=$dest_dir/latency_graph.svg | |
# TODO(awreece) This is a gross hack that needs to go as soon as possible. | |
sudo -l | grep -q NOPASSWD || \ | |
(echo "Cannot execute passwordless sudo" >&2; false) || exit 1 | |
echo "Condensing stacks from $perf_data_dir into $dest_dir" | |
# | |
# Parse the perf data files into stacks for consumption by flamegraph.pl. | |
# Since we will be comparing sched stacks to cpu frequency stacks, we need to | |
# convert both to a common unit: weight in milliseconds. To simplify this | |
# logic, we perform the conversion before c++filt (which may add spaces, etc). | |
# | |
# Since we measured 997 samples/sec, we need to divide by 997/1000 to get ms. | |
sudo perf script -i "$freq_data_file" | stackcollapse-perf.pl | \ | |
awk '{print $1, int($2*1000/997)}' | \ | |
c++filt | grep "$function_filter" >"$freq_ms_stacks_file" & | |
# | |
# We do not need to convert the count stacks, since they are normalized | |
# in difffolded.pl and not directly compared to the sched stacks. | |
# | |
sudo perf script -i "$count_data_file" | stackcollapse-perf.pl | \ | |
c++filt | grep "$function_filter" >"$count_raw_stacks_file" & | |
# | |
# Since this reports sleep time in us, we need to divide by 1000 to get ms. | |
# To simplify later processing, we emit already "diffed" stacks at this stage. | |
# By emitting "0, <time ms>" for each sched stack and using `--negate` as an | |
# argument to flamegraph.pl below, we ensure the sched stacks will be colored | |
# bright blue and sized according to the time spent off cpu. | |
# | |
sudo perf script -f time,comm,pid,tid,event,ip,sym,dso,trace \ | |
-i "$sched_data_file" | \ | |
stackcollapse-perf-sched.awk | \ | |
awk '(int($2/1000) > 0) {print $1, 0, int($2/1000)}' | \ | |
c++filt | grep "$function_filter" >"$sched_diffed_stacks_file" & | |
wait | |
echo "done." | |
echo -n "Generating $latency_svg_file ... " | |
# | |
# We diff the stacks carefully to try to make stacks colored blue represent | |
# blocking or stalling for memory and stacks colored red represent efficient | |
# stacks. Intuitively, if a stack is more common in freq stacks (measured | |
# ~1000x a second) than in count stacks (measured every ~3M instructions on a | |
# ~3GHz machine), than it represents a "slower" stack. Since we normalize both | |
# counts, we only measure the proporition of the total samples a stack | |
# constitutes in both cases and the precise numbers are not important. | |
# | |
# If compare the stacks in this order and using `--negate` as an argument to | |
# flamegraph.pl below, we cause stacks that are sampled (proportionately) more | |
# frequently in count.stacks than in freq.stacks will be colored red (and | |
# vice versa will be colored blue) and sized according to the time spent | |
# on cpu. | |
# | |
cat <(difffolded.pl -n "$count_raw_stacks_file" "$freq_ms_stacks_file") \ | |
"$sched_diffed_stacks_file" | \ | |
flamegraph.pl --negate >"$latency_svg_file" | |
echo "done." | |
grep -q CDATA "$latency_svg_file" || \ | |
(echo "No data in flamegraph!" >&2; false) || \ | |
exit 22 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# | |
# Collects perf.data files needed for a "latency heat map". | |
# | |
workdir=$1 | |
pids=$2 | |
function die | |
{ | |
echo "$@" >&2 | |
exit 1 | |
} | |
sudo -l | grep -q NOPASSWD || die "Cannot execute passwordless sudo" | |
sudo mkdir -p "$workdir" || die "Failure to create $workdir" | |
trap finish INT QUIT TERM PIPE HUP | |
function finish | |
{ | |
[[ -z $perf_count_pid && -z $perf_freq_pid && -z $perf_sched_pid ]] || \ | |
sudo kill $perf_count_pid $perf_freq_pid $perf_sched_pid | |
wait | |
} | |
# Aim to sample every ~3M instructions since moderns CPUs are ~3GHz. | |
sudo perf record --output "$workdir/count.data" --pid=$pids --call-graph=fp \ | |
--event cycles:u --count $((2**10 * 997 * 3)) & | |
perf_count_pid=$! | |
# Sample at 997 Hz to avoid sampling in lockstep with something at 1000Hz. | |
sudo perf record --output "$workdir/freq.data" --pid=$pids --call-graph=fp \ | |
--event cycles:u --freq 997 & | |
perf_freq_pid=$! | |
sudo perf record --output "$workdir/sched.data" --pid=$pids --call-graph=fp \ | |
--event sched:sched_switch --event sched:sched_stat_sleep \ | |
--event sched:sched_stat_blocked --event sched:sched_process_exit & | |
perf_sched_pid=$! | |
wait |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment