Skip to content

Instantly share code, notes, and snippets.

@yaroslavvb
yaroslavvb / xla-test.py
Created January 14, 2017 21:48
Simple XLA benchmark
# XLA compilation controlled by "compile_ops" option
# compile_ops=False: 4.39 sec
# compile_ops=True: 0.90 sec
import os
os.environ['CUDA_VISIBLE_DEVICES']=''
import tensorflow as tf
@yaroslavvb
yaroslavvb / show_graph
Created December 30, 2016 01:50
Visualizing graph in Jupyter notebook
# make things wide
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from IPython.display import clear_output, Image, display, HTML
def strip_consts(graph_def, max_const_size=32):
"""Strip large constant values from graph_def."""
strip_def = tf.GraphDef()
for n0 in graph_def.node:
# Try to copy "a" value to "c" while simultaneously adding vector of 1's to a.
# If the copy is started before the first assign_add, the copied value will be inconsistent.
#
# Running it on macbook my "c" ends up with a mix of values between 1 and 6
#
#
# 16.017478 copy 1 (0) starting
# 17.006894 write 1 (0) starting
# 28.431654 write 1 ending (11.4247 sec)
# 29.436692 write 1 (1) starting
@yaroslavvb
yaroslavvb / simple_barrier.py
Created December 16, 2016 06:03
Example of using shared counters to implement Barrier primitive
"""Example of barrier implementation using TensorFlow shared variables.
All workers synchronize on barrier, copy global parameters to local versions
and increment global parameter variable asynchronously. Should see something
like this:
bash> killall python
bash> python simple_barrier.py --num_workers=4
worker 0, local_param 4 global_param 5
worker 2, local_param 4 global_param 7
@yaroslavvb
yaroslavvb / benchmark_grpc_recv.py
Last active December 27, 2022 06:24
Benchmark slowness of passing Tensors around between TF workers
# Dependencies:
# portpicker (pip install portpicker)
# tcmalloc4 (sudo apt-get install google-perftools)
# TF 0.12
#
#
# Benchmarks on Xeon E5-2630 v3 @ 2.40GHz
#
# export LD_PRELOAD=/usr/lib/libtcmalloc.so.4
# python benchmark_grpc_recv.py --data_mb=128
@yaroslavvb
yaroslavvb / count_ops.py
Created December 11, 2016 01:44
count number of ops in TensorFlow low-level API
from google.protobuf import text_format
from tensorflow.core.framework import op_def_pb2
# Count the op definitions listed in TensorFlow's text-format ops.pbtxt file.
ops = op_def_pb2.OpList()
# Read the file inside a context manager so the handle is closed
# deterministically rather than whenever the garbage collector gets to it.
with open("/local_home/yaroslav/tensorflow.git/tensorflow/tensorflow/core/ops/ops.pbtxt") as ops_file:
    ops_text = ops_file.read()
# Parse the text-format protobuf into the OpList message.
text_format.Merge(ops_text, ops)
# The number of entries in `ops.op` is the op count this script reports.
print(len(ops.op))
@yaroslavvb
yaroslavvb / client_transfer_benchmark.py
Last active February 6, 2018 09:25
benchmark TensorFlow<->Python transfer rate
# Benchmark transferring data from TF into Python runtime
#
## Dependencies:
# portpicker (pip install portpicker)
# tcmalloc4 (sudo apt-get install google-perftools)
# TF 0.12 (for var.read_value(), ones_initializer())
#
# On Linux default malloc is slow
# sudo apt-get install google-perftools
# export LD_PRELOAD="/usr/lib/libtcmalloc.so.4"
@yaroslavvb
yaroslavvb / stats_summarizer_example.py
Last active February 6, 2018 09:25
Example of using stats summarizer
# export LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
import tensorflow as tf
a = tf.ones((10, 10))
b = tf.ones((10, 10))
c = tf.matmul(a, b)
ss = tf.contrib.stat_summarizer.NewStatSummarizer(tf.get_default_graph().as_graph_def().SerializeToString())
sess = tf.Session()
for i in range(10):
@yaroslavvb
yaroslavvb / sharded_ps_benchmark.py
Last active December 27, 2022 06:25
Example of local cluster with multiple workers/training loops sharded parameter server
#!/usr/bin/env python
# Benchmark transferring data, part of troubleshooting https://github.com/tensorflow/tensorflow/issues/6116
#
# Take `a` independent workers communicating with `b` parameter shards.
# Each worker tries to add to variables stored on parameter server as fast as
# possible.
#
# macbook
# ps=1: 1.6 GB/s
# ps=2: 2.6 GB/s
@yaroslavvb
yaroslavvb / scratch.py
Created December 8, 2016 22:51
Example of using stats summarizer
import tensorflow as tf
a = tf.ones((10, 10))
b = tf.ones((10, 10))
c = tf.matmul(a, b)
ss = tf.contrib.stat_summarizer.NewStatSummarizer(tf.get_default_graph().as_graph_def().SerializeToString())
sess = tf.Session()
for i in range(10):
run_metadata = tf.RunMetadata()
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)