from __future__ import division, print_function
import tensorflow as tf
""" | |
This program tries to test whether or not TensorFlow implements an inter-op thread pool on GPUs. In other words, | |
it checks whether or not operations that don't depend on each other can actually run in parallel. | |
To check this, it creates a TensorFlow graph that computes 1 + 1/2 + 1/4 + 1/8 + ... | |
There are two variables `x` and `y`, and two operations that modify these variables: | |
* `add` computes x <- x + y | |
* `divide` computes y <- y / 2 | |
There is no explicit dependency between the `add` and `divide` operations, so if there is an inter-op thread | |
pool, then TensorFlow will try to run them in parallel. If this is the case, sometimes `add` will execute first, | |
and sometimes `divide` will execute first. | |
For each device, the code runs three experiments: | |
1) run 2000 iterations, and at each iteration manually evaluate `add`, then `divide`. This forces the execution | |
order, so the end result should always be 2.0, regardless of the presence or absence of an inter-op thread pool. | |
We do 20 runs of all this, so it should display 2.0 a total of 20 times. | |
2) run 2000 iterations, but this time evaluate both `add` and `divide` simultaneously: `sess.run([add, divide])`. | |
If there is an inter-op thread pool, then the order of execution at each iteration may change. We may end up with | |
the order add, divide, divide, add, divide add, add, divide, etc. or another order, depending on the CPU speed and | |
load. So the result may change at each run. | |
3) do the same as 2), but evaluate `sess.run([divide, add])` instead. | |
Here are the results: | |
* unsurprisingly, the first experiment prints 2.0 twenty times, both for the CPU and the GPU. It's a sanity check, | |
and it works. | |
* the second experiment prints 1.00049, 1.0, 1.00012, 1.0, ..., 1.5, 1.00403 for the CPU, but it display 2.0 twenty | |
times for the GPU. This confirms that there is an inter-op thread pool on the CPU, but it seems to show that | |
there is no inter-op thread pool on the GPU. I tried to run the program while the GPU was busing doing something | |
else, but it did not change the result. This is not a hard proof, because it is conceivable that the operations | |
are run in parallel, but the `add` operation always finishes first because it is shorter to compute than `divide`. | |
But it seems very unlikely. So my conclusion is that TensorFlow has no inter-op thread pool for GPUs. This makes | |
sense if most operations use heavily multithreaded implementations (e.g., cuDNN) that already use up all the GPU's | |
threads: there would be no significant performance gain in running multiple operations in parallel, they would | |
just compete against each other, not actually run in parallel. | |
* the third experiment has the same results. This shows that it makes no difference whether you run | |
`sess.run([add, divide])` or `sess.run([divide, add])`. The order is decided deterministically by TensorFlow, and | |
it seems to ignore the order of the operations in the list of operations to evaluate. | |
""" | |
for device in ("/cpu:0", "/gpu:0"):
    print("-" * 80)
    print("Device:", device)

    # Build the graph: x accumulates the partial sums, y holds the next term.
    graph = tf.Graph()
    with graph.as_default():
        with tf.device(device):
            x = tf.Variable(0.0)
            y = tf.Variable(1.0)
            add = tf.assign(x, x + y)     # x <- x + y
            divide = tf.assign(y, y / 2)  # y <- y / 2
            init = tf.global_variables_initializer()

    print("Experiment #1: manual sequential execution")
    for execution in range(20):
        with tf.Session(graph=graph) as sess:
            init.run()
            for i in range(2000):
                sess.run(add)
                sess.run(divide)
            print(x.eval())

    print("Experiment #2: possible parallel execution")
    for execution in range(20):
        with tf.Session(graph=graph) as sess:
            init.run()
            for i in range(2000):
                sess.run([add, divide])
            print(x.eval())

    print("Experiment #3: possible parallel execution, reversed")
    for execution in range(20):
        with tf.Session(graph=graph) as sess:
            init.run()
            for i in range(2000):
                sess.run([divide, add])
            print(x.eval())
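# Follow-up idea (untested sketch, not part of the experiments above):
# TensorFlow exposes the size of the CPU inter-op pool through tf.ConfigProto.
# If the nondeterminism in experiment #2 really comes from that pool, pinning
# it to a single thread should serialize the independent ops and make the CPU
# result stable from run to run (whatever order the runtime happens to pick).
cpu_graph = tf.Graph()
with cpu_graph.as_default():
    with tf.device("/cpu:0"):
        x_c = tf.Variable(0.0)
        y_c = tf.Variable(1.0)
        add_c = tf.assign(x_c, x_c + y_c)
        divide_c = tf.assign(y_c, y_c / 2)
        init_c = tf.global_variables_initializer()

config = tf.ConfigProto(inter_op_parallelism_threads=1)
for execution in range(20):
    with tf.Session(graph=cpu_graph, config=config) as sess:
        init_c.run()
        for i in range(2000):
            sess.run([add_c, divide_c])
        print("CPU, single inter-op thread:", x_c.eval())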