Last active
May 18, 2018 17:19
-
-
Save MattsonThieme/1f3ee54b561dd182ea55d99157be887a to your computer and use it in GitHub Desktop.
Run TensorFlow CNN benchmarks on a single node with multiple workers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Mattson Thieme | 2018
#
# Run distributed training with TensorFlow's tf_cnn_benchmarks: four worker
# tasks and one parameter-server (ps) task on a single 36-core Skylake node.
# (Fixed: original first line was '# !/bin/bash' — the space after '#' makes
# it an ordinary comment, not a shebang, so the script could be run by the
# wrong shell.)

# Activate the TensorFlow virtual environment.
source activate tensorflow_p36

# Cluster spec: one ps task and four worker tasks, all on localhost.
# NOTE(review): you may need to change these port numbers if they collide
# with services already running on your machine.
ps_list="127.0.0.1:26214"
workers_list="127.0.0.1:1104,127.0.0.1:10628,127.0.0.1:1124,127.0.0.1:2003"

# Per-task environment and thread-pool arguments. These strings are expanded
# unquoted on purpose later so they word-split into separate arguments.
worker_env="export OMP_NUM_THREADS=9"
ps_env="export OMP_NUM_THREADS=9"
ps_args="--num_intra_threads 4 --num_inter_threads 2"
worker_args="--num_intra_threads 9 --num_inter_threads 4"

# Clone the benchmark scripts.
# NOTE: if using python3 you may need to change 'import cPickle' to
# 'import _pickle as cPickle' in benchmarks/scripts/tf_cnn_benchmarks/datasets.py
git clone -b mkl_experiment https://github.com/tensorflow/benchmarks.git
# Abort if the clone failed — otherwise the 'rm' and benchmark runs below
# would execute in the wrong directory.
cd benchmarks/scripts/tf_cnn_benchmarks || exit 1
rm -f *.log  # remove logs from any previous benchmark runs; -f so a clean tree is not an error
## Run training benchmark scripts

# Larger networks may run out of memory with four workers.
networks=( inception3 resnet50 resnet152 vgg16 )
batch_sizes=( 32 64 96 )
num_batches=30

for network in "${networks[@]}" ; do
# Start a single PS task for each topology and remember its PID, so that we
# kill exactly the process we started. The original 'kill $(pidof python)'
# would have killed EVERY python process on the machine (and 'pidof' is not
# available on all systems).
numactl -l python tf_cnn_benchmarks.py $ps_args --job_name ps --task_index 0 --ps_hosts $ps_list --worker_hosts $workers_list &
ps_pid=$!
# Start worker tasks. Workers 0-2 are backgrounded; worker 3 runs in the
# foreground so each batch-size round completes before the next one starts.
for bs in "${batch_sizes[@]}"; do
echo -e "\n\n #### Starting $network and batch size = $bs ####\n\n"
# Worker 0 — pinned to cores 0-9 plus hyperthread siblings 35-41.
$worker_env;nohup numactl -m 0 python tf_cnn_benchmarks.py $worker_args --sync_on_finish True --kmp_affinity="granularity=thread,proclist=[0-9,35-41],explicit,verbose" --job_name worker --task_index 0 --device cpu --data_format NCHW --cpu skl --data_name synthetic --model "$network" --learning_rate 0.001 --num_epochs_per_decay 2 --batch_size "$bs" --optimizer rmsprop --ps_hosts $ps_list --worker_hosts $workers_list --num_batches $num_batches 2>&1 | tee net_"$network"_bs_"$bs"_0.log &
# Worker 1
$worker_env;nohup numactl -m 0 python tf_cnn_benchmarks.py $worker_args --sync_on_finish True --kmp_affinity="granularity=thread,proclist=[10-19,42-48],explicit,verbose" --job_name worker --task_index 1 --device cpu --data_format NCHW --cpu skl --data_name synthetic --model "$network" --learning_rate 0.001 --num_epochs_per_decay 2 --batch_size "$bs" --optimizer rmsprop --ps_hosts $ps_list --worker_hosts $workers_list --num_batches $num_batches 2>&1 | tee net_"$network"_bs_"$bs"_1.log &
# Worker 2
$worker_env;nohup numactl -m 0 python tf_cnn_benchmarks.py $worker_args --sync_on_finish True --kmp_affinity="granularity=thread,proclist=[20-29,49-55],explicit,verbose" --job_name worker --task_index 2 --device cpu --data_format NCHW --cpu skl --data_name synthetic --model "$network" --learning_rate 0.001 --num_epochs_per_decay 2 --batch_size "$bs" --optimizer rmsprop --ps_hosts $ps_list --worker_hosts $workers_list --num_batches $num_batches 2>&1 | tee net_"$network"_bs_"$bs"_2.log &
# Worker 3 — foreground (no trailing '&'): acts as the barrier for this round.
$worker_env;nohup numactl -m 0 python tf_cnn_benchmarks.py $worker_args --sync_on_finish True --kmp_affinity="granularity=thread,proclist=[30-36,56-72],explicit,verbose" --job_name worker --task_index 3 --device cpu --data_format NCHW --cpu skl --data_name synthetic --model "$network" --learning_rate 0.001 --num_epochs_per_decay 2 --batch_size "$bs" --optimizer rmsprop --ps_hosts $ps_list --worker_hosts $workers_list --num_batches $num_batches 2>&1 | tee net_"$network"_bs_"$bs"_3.log
done
# Kill only the PS task started above for this topology.
kill "$ps_pid"
done
## Summarize throughput: pull the images/sec figure out of every worker log.
workers=( 0 1 2 3 )
sleep 1
printf '\n Network batch_size images/second worker\n\n'
for net in "${networks[@]}" ; do
for size in "${batch_sizes[@]}"; do
for id in "${workers[@]}"; do
log="net_${net}_bs_${size}_${id}.log"
# Take the value after the colon in 'total images/sec: N'; xargs trims whitespace.
fps=$(grep "total images/sec:" "$log" | cut -d ":" -f2 | xargs)
echo "$net $size $fps $id"
done
done
printf '\n\n'
done
# Deactivate the virtual environment.
source deactivate
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment