Bairen Yi byronyi
Just for fun
diff --git a/tensorflow/core/distributed_runtime/rpc/rdma.cc b/tensorflow/core/distributed_runtime/rpc/rdma.cc
index 145b24396..0e73c31b1 100644
--- a/tensorflow/core/distributed_runtime/rpc/rdma.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rdma.cc
@@ -310,34 +310,9 @@ class RdmaReadClient : public RdmaClient {
}
#endif
- // TODO: Remove code used for debugging purposes only
- string tensor_debug_string;
byronyi / benchmark_grpc_recv.py
Created July 4, 2017 05:09 — forked from yaroslavvb/benchmark_grpc_recv.py
Benchmark slowness of passing Tensors around between TF workers
# Dependencies:
# portpicker (pip install portpicker)
# tcmalloc4 (sudo apt-get install google-perftools)
# TF 0.12
#
#
# Benchmarks on Xeon E5-2630 v3 @ 2.40GHz
#
# export LD_PRELOAD=/usr/lib/libtcmalloc.so.4
# python benchmark_grpc_recv.py --data_mb=128
2017-06-28 03:31:29.606714: E tensorflow/stream_executor/cuda/cuda_driver.cc:406] failed call to cuInit: CUDA_ERROR_NO_DEVICE
2017-06-28 03:31:29.606826: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:158] retrieving CUDA diagnostic information for host: ip-192-168-2-200
2017-06-28 03:31:29.606839: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:165] hostname: ip-192-168-2-200
2017-06-28 03:31:29.606892: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:189] libcuda reported version is: 375.26.0
2017-06-28 03:31:29.606950: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:369] driver version file contents: """NVRM version: NVIDIA UNIX x86_64 Kernel Module 375.26 Thu Dec 8 18:36:43 PST 2016
GCC version: gcc version 6.3.0 20170516 (Debian 6.3.0-18)
"""
2017-06-28 03:31:29.606973: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:193] kernel reported version is: 375.26.0
2017-06-28 03:31:29.606981: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:300] kernel version seems to ma
2017-06-28 03:31:29.957231: I tensorflow/core/common_runtime/gpu/gpu_device.cc:938] Found device 0 with properties:
name: Tesla K40m
major: 3 minor: 5 memoryClockRate (GHz) 0.745
pciBusID 0000:02:00.0
Total memory: 11.17GiB
Free memory: 11.10GiB
2017-06-28 03:31:30.169520: W tensorflow/stream_executor/cuda/cuda_driver.cc:523] A non-primary context 0x5569248497c0 exists before initializing the StreamExecutor. We haven't verified StreamExecutor works with that.
2017-06-28 03:31:30.171168: I tensorflow/core/common_runtime/gpu/gpu_device.cc:938] Found device 1 with properties:
name: Tesla K40m
major: 3 minor: 5 memoryClockRate (GHz) 0.745
$ python -m unittest discover -s tests -p "example_test.py"
Testing something...
<class 'tensorflow.contrib.rnn.python.ops.core_rnn_cell_impl.LSTMCell'>
2017-06-14 01:46:17.783828: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.
2017-06-14 01:46:17.783914: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2017-06-14 01:46:17.783961: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
2017-06-14 01:46:17.784013: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
byronyi / local_distributed_benchmark.py
Created June 12, 2017 05:57 — forked from yaroslavvb/local_distributed_benchmark.py
Benchmark distributed tensorflow locally by adding vector of ones on worker2 to variable on worker1 as fast as possible
"""Benchmark tensorflow distributed by adding vector of ones on worker2
to variable on worker1 as fast as possible.
On 2014 macbook, TensorFlow 0.10 this shows
Local rate: 2175.28 MB per second
Distributed rate: 107.13 MB per second
"""
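The MB-per-second figures above are simply bytes moved divided by wall-clock time; a minimal sketch of that calculation in plain Python (the helper name `rate_mb_per_sec` is my own, not from the benchmark):

```python
def rate_mb_per_sec(nbytes, seconds):
    """Throughput in MB/s: bytes moved divided by wall-clock time."""
    return nbytes / seconds / 1e6

# The 2175.28 MB/s local rate corresponds to moving ~2.175 GB in one second:
print(rate_mb_per_sec(2175.28e6, 1.0))  # 2175.28
```

The ~20x gap between the local and distributed rates is the point of the benchmark: the distributed path pays for gRPC serialization and a socket round trip on every transfer.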
#!/bin/bash
# Simple script to list version numbers of critical development tools
export LC_ALL=C
bash --version | head -n1 | cut -d" " -f2-4
MYSH=$(readlink -f /bin/sh)
echo "/bin/sh -> $MYSH"
echo $MYSH | grep -q bash || echo "ERROR: /bin/sh does not point to bash"
unset MYSH
echo -n "Binutils: "; ld --version | head -n1 | cut -d" " -f3-

RDD and Static Single Assignment (SSA) Form

SSA is an intermediate program representation in which each variable is assigned exactly once and is read-only afterwards. It was invented to simplify program analyses such as dataflow analysis: constructing def-use chains and variable DAGs is simple and straightforward in SSA form.

x = 1
y = 2
z = x + y + 1
w = 2 * x
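The snippet above is already in SSA form, since each variable is defined once. When a variable is reassigned, SSA conversion splits it into fresh versions. A toy renaming sketch for straight-line code (my own hypothetical helper `to_ssa`; no control flow, hence no phi nodes):

```python
# Statements are (target, expression-tokens) pairs.
def to_ssa(stmts):
    version = {}                      # latest version number per variable
    out = []
    for target, expr in stmts:
        # Rewrite each use with the current version of that variable.
        expr = [f"{t}{version[t]}" if t in version else t for t in expr]
        version[target] = version.get(target, 0) + 1
        out.append((f"{target}{version[target]}", expr))
    return out

# A reassignment of x splits into x1 and x2, each defined exactly once:
prog = [("x", ["1"]), ("x", ["x", "+", "1"]), ("y", ["x", "*", "2"])]
for tgt, expr in to_ssa(prog):
    print(tgt, "=", " ".join(expr))
# x1 = 1
# x2 = x1 + 1
# y1 = x2 * 2
```

After renaming, every definition dominates all of its uses, which is what makes def-use chains trivial to build.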
byronyi / latency.txt
Created November 19, 2016 08:21 — forked from jboner/latency.txt
Latency Numbers Every Programmer Should Know
Latency Comparison Numbers
--------------------------
L1 cache reference 0.5 ns
Branch mispredict 5 ns
L2 cache reference 7 ns 14x L1 cache
Mutex lock/unlock 25 ns
Main memory reference 100 ns 20x L2 cache, 200x L1 cache
Compress 1K bytes with Zippy 3,000 ns 3 us
Send 1K bytes over 1 Gbps network 10,000 ns 10 us
Read 4K randomly from SSD* 150,000 ns 150 us ~1GB/sec SSD
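The multipliers in the list fall out directly from the raw numbers; a quick sanity check in Python (values copied from the list above):

```python
ns = {
    "L1 cache reference": 0.5,
    "branch mispredict": 5.0,
    "L2 cache reference": 7.0,
    "mutex lock/unlock": 25.0,
    "main memory reference": 100.0,
    "SSD 4K random read": 150_000.0,
}

# Main memory is 200x an L1 reference, as the list notes:
print(ns["main memory reference"] / ns["L1 cache reference"])  # 200.0
```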
$ ibv_rc_pingpong -d rxe0 -g 0 192.168.2.100
local address: LID 0x0000, QPN 0x000011, PSN 0x691d06, GID fe80::ae8:4fff:fefb:ade6
remote address: LID 0x0000, QPN 0x000011, PSN 0xd0fbd7, GID fe80::ae8:4fff:fefb:abbe
8192000 bytes in 0.19 seconds = 341.87 Mbit/sec
1000 iters in 0.19 seconds = 191.70 usec/iter
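The two summary lines are mutually consistent: 1000 round trips of 8192-byte messages at 191.70 µs/iter works out to the reported bandwidth, as a quick back-of-the-envelope check shows:

```python
bytes_total = 8192000
iters = 1000
usec_per_iter = 191.70

bytes_per_iter = bytes_total // iters  # 8192 bytes per ping-pong round trip
mbit_per_sec = bytes_per_iter * 8 / (usec_per_iter * 1e-6) / 1e6
print(round(mbit_per_sec, 2))  # 341.87
```

Note this is a latency-bound small-message test over soft-RoCE (`rxe0`), so the bandwidth figure is far below link rate by design.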