Last active
May 18, 2018 17:19
-
-
Save MattsonThieme/1f3ee54b561dd182ea55d99157be887a to your computer and use it in GitHub Desktop.
Run TensorFlow CNN benchmarks on a single node with multiple workers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Mattson Thieme | 2018
#
# Run distributed training with TensorFlow's tf_cnn_benchmarks: four worker
# tasks and one parameter-server (ps) task on a single 36-core Skylake node.
# (Fixed: original first line was '# !/bin/bash' — the space after '#' makes
# it an ordinary comment, not a shebang, so the script could be run by the
# wrong shell.)

# Activate the TensorFlow virtual environment.
source activate tensorflow_p36

# Cluster spec: one ps task and four worker tasks, all on localhost.
# NOTE(review): you may need to change these port numbers if they collide
# with services already running on your machine.
ps_list="127.0.0.1:26214"
workers_list="127.0.0.1:1104,127.0.0.1:10628,127.0.0.1:1124,127.0.0.1:2003"

# Per-task environment and thread-pool arguments. These strings are expanded
# unquoted on purpose later so they word-split into separate arguments.
worker_env="export OMP_NUM_THREADS=9"
ps_env="export OMP_NUM_THREADS=9"
ps_args="--num_intra_threads 4 --num_inter_threads 2"
worker_args="--num_intra_threads 9 --num_inter_threads 4"

# Clone the benchmark scripts.
# NOTE: if using python3 you may need to change 'import cPickle' to
# 'import _pickle as cPickle' in benchmarks/scripts/tf_cnn_benchmarks/datasets.py
git clone -b mkl_experiment https://github.com/tensorflow/benchmarks.git
# Abort if the clone failed — otherwise the 'rm' and benchmark runs below
# would execute in the wrong directory.
cd benchmarks/scripts/tf_cnn_benchmarks || exit 1
rm -f *.log  # remove logs from any previous benchmark runs; -f so a clean tree is not an error
## Run training benchmark scripts

# Larger networks may run out of memory with four workers.
networks=( inception3 resnet50 resnet152 vgg16 )
batch_sizes=( 32 64 96 )
num_batches=30

for network in "${networks[@]}" ; do
# Start a single PS task for each topology and remember its PID, so that we
# kill exactly the process we started. The original 'kill $(pidof python)'
# would have killed EVERY python process on the machine (and 'pidof' is not
# available on all systems).
numactl -l python tf_cnn_benchmarks.py $ps_args --job_name ps --task_index 0 --ps_hosts $ps_list --worker_hosts $workers_list &
ps_pid=$!
# Start worker tasks. Workers 0-2 are backgrounded; worker 3 runs in the
# foreground so each batch-size round completes before the next one starts.
for bs in "${batch_sizes[@]}"; do
echo -e "\n\n #### Starting $network and batch size = $bs ####\n\n"
# Worker 0 — pinned to cores 0-9 plus hyperthread siblings 35-41.
$worker_env;nohup numactl -m 0 python tf_cnn_benchmarks.py $worker_args --sync_on_finish True --kmp_affinity="granularity=thread,proclist=[0-9,35-41],explicit,verbose" --job_name worker --task_index 0 --device cpu --data_format NCHW --cpu skl --data_name synthetic --model "$network" --learning_rate 0.001 --num_epochs_per_decay 2 --batch_size "$bs" --optimizer rmsprop --ps_hosts $ps_list --worker_hosts $workers_list --num_batches $num_batches 2>&1 | tee net_"$network"_bs_"$bs"_0.log &
# Worker 1
$worker_env;nohup numactl -m 0 python tf_cnn_benchmarks.py $worker_args --sync_on_finish True --kmp_affinity="granularity=thread,proclist=[10-19,42-48],explicit,verbose" --job_name worker --task_index 1 --device cpu --data_format NCHW --cpu skl --data_name synthetic --model "$network" --learning_rate 0.001 --num_epochs_per_decay 2 --batch_size "$bs" --optimizer rmsprop --ps_hosts $ps_list --worker_hosts $workers_list --num_batches $num_batches 2>&1 | tee net_"$network"_bs_"$bs"_1.log &
# Worker 2
$worker_env;nohup numactl -m 0 python tf_cnn_benchmarks.py $worker_args --sync_on_finish True --kmp_affinity="granularity=thread,proclist=[20-29,49-55],explicit,verbose" --job_name worker --task_index 2 --device cpu --data_format NCHW --cpu skl --data_name synthetic --model "$network" --learning_rate 0.001 --num_epochs_per_decay 2 --batch_size "$bs" --optimizer rmsprop --ps_hosts $ps_list --worker_hosts $workers_list --num_batches $num_batches 2>&1 | tee net_"$network"_bs_"$bs"_2.log &
# Worker 3 — foreground (no trailing '&'): acts as the barrier for this round.
$worker_env;nohup numactl -m 0 python tf_cnn_benchmarks.py $worker_args --sync_on_finish True --kmp_affinity="granularity=thread,proclist=[30-36,56-72],explicit,verbose" --job_name worker --task_index 3 --device cpu --data_format NCHW --cpu skl --data_name synthetic --model "$network" --learning_rate 0.001 --num_epochs_per_decay 2 --batch_size "$bs" --optimizer rmsprop --ps_hosts $ps_list --worker_hosts $workers_list --num_batches $num_batches 2>&1 | tee net_"$network"_bs_"$bs"_3.log
done
# Kill only the PS task started above for this topology.
kill "$ps_pid"
done
## Summarize throughput: pull the images/sec figure out of every worker log.
workers=( 0 1 2 3 )
sleep 1
printf '\n Network batch_size images/second worker\n\n'
for net in "${networks[@]}" ; do
for size in "${batch_sizes[@]}"; do
for id in "${workers[@]}"; do
log="net_${net}_bs_${size}_${id}.log"
# Take the value after the colon in 'total images/sec: N'; xargs trims whitespace.
fps=$(grep "total images/sec:" "$log" | cut -d ":" -f2 | xargs)
echo "$net $size $fps $id"
done
done
printf '\n\n'
done
# Deactivate the virtual environment.
source deactivate
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment