Skip to content

Instantly share code, notes, and snippets.

@annanay25
Last active July 3, 2017 12:38
Show Gist options
  • Save annanay25/9bb3226bf7976de600f4db40f47c965a to your computer and use it in GitHub Desktop.
Save annanay25/9bb3226bf7976de600f4db40f47c965a to your computer and use it in GitHub Desktop.
from __future__ import print_function
import tensorflow as tf
import numpy
import time
import os
from random import randint
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
# Read the MNIST data sets from the given directory, with one-hot labels.
mnist = input_data.read_data_sets(
    "/tmp/tensorflow/mnist/input_data",
    one_hot=True,
)

# Training hyper-parameters.
learning_rate = 0.001
training_iters = 2000
batch_size = 128
display_step = 10

# Network dimensions.
n_input = 28 * 28  # flattened MNIST image (28x28 pixels)
n_classes = 10  # MNIST digit classes 0-9
dropout = 0.75  # probability of keeping a unit

# Graph inputs; the leading dimension is left open (None).
x = tf.placeholder(tf.float32, shape=[None, n_input])
y = tf.placeholder(tf.float32, shape=[None, n_classes])
keep_prob = tf.placeholder(tf.float32)  # dropout keep probability
# Create some wrappers for simplicity
def conv2d(x, W, b, strides=1):
    """Reshape ``x`` to image form, convolve it with ``W`` and add bias ``b``.

    ``x`` is reshaped to ``[-1, 28, 28, 1]`` before the convolution, which
    uses ``strides`` for the two middle stride entries and 'SAME' padding.
    The bias is added with ``tf.nn.bias_add``; no activation is applied.
    """
    images = tf.reshape(x, shape=[-1, 28, 28, 1])
    stride_spec = [1, strides, strides, 1]
    convolved = tf.nn.conv2d(images, W, strides=stride_spec, padding='SAME')
    return tf.nn.bias_add(convolved, b)
# Store layers weight & bias (only needed for the conv2d experiment).
weights = {
    # 5x5 conv, 1 input, 32 outputs
    'wc1': tf.Variable(tf.random_normal([5, 5, 1, 32])),
}
biases = {
    'bc1': tf.Variable(tf.random_normal([32])),
}

# Fixed-shape inputs for the matmul under test: `c` is a [1, 784] row vector
# and `a` a [784, 1] column vector, so `d` is a single [1, 1] dot product.
a = tf.placeholder(tf.float32, [784, 1])
c = tf.placeholder(tf.float32, [1, 784])
d = tf.matmul(c, a)

# Initializing the variables
init = tf.global_variables_initializer()

# Single-threaded session configuration with the global XLA JIT level ON_1.
config = tf.ConfigProto(
    intra_op_parallelism_threads=1,
    inter_op_parallelism_threads=1
)
jit_level = tf.OptimizerOptions.ON_1
# Not used below; kept for per-op experiments such as `with jit_scope(): ...`.
jit_scope = tf.contrib.compiler.jit.experimental_jit_scope
config.graph_options.optimizer_options.global_jit_level = jit_level
run_metadata = tf.RunMetadata()

# Launch the graph
with tf.Session(config=config) as sess:
    sess.run(init)

    # Pick one of the first eleven images of a batch (randint is inclusive on
    # both ends) and shape it as a [1, 784] row vector. Distinct names are
    # used so the `x` placeholder is not overwritten.
    batch = mnist.train.next_batch(batch_size)
    num = randint(0, 10)
    image = numpy.reshape(batch[0][num], (1, 784))

    # Time only the graph execution, not the printing of the result.
    beg = time.time()
    result = sess.run(d, feed_dict={c: image, a: numpy.transpose(image)})
    end = time.time()

    print(result)
    # Report the shape of the computed value rather than of the symbolic
    # tensor `d`.
    print(numpy.shape(result))

print(end - beg)
@DavidNorman
Copy link

Try something like this (I may have the JIT enable setting slightly wrong):

import tensorflow as tf

# Minimal 2x2 matmul graph.
x = tf.placeholder(tf.float32, [2, 2])
y = tf.placeholder(tf.float32, [2, 2])
z = tf.matmul(x, y)

config = tf.ConfigProto(
    intra_op_parallelism_threads=1,
    inter_op_parallelism_threads=1,
)
# A dotted name cannot be passed as a keyword argument, so the nested JIT
# option is set on the config object after it has been constructed.
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

# The context manager closes the session once the result has been printed.
with tf.Session(config=config) as sess:
    print(sess.run(z, feed_dict={x: [[1, 0], [0, 1]], y: [[2, 3], [4, 5]]}))

The way I would do it is:

import tensorflow as tf

# Place the whole 2x2 matmul graph on the XLA CPU device.
with tf.device('/device:XLA_CPU:0'):
    x = tf.placeholder(tf.float32, [2, 2])
    y = tf.placeholder(tf.float32, [2, 2])
    z = tf.matmul(x, y)

# print(...) with a single argument works on both Python 2 and Python 3; the
# context manager closes the session once the result has been printed.
with tf.Session() as sess:
    print(sess.run(z, feed_dict={x: [[1, 0], [0, 1]], y: [[2, 3], [4, 5]]}))

@annanay25
Copy link
Author

annanay25 commented Jun 29, 2017

@DavidNorman, Thanks for this!

I JIT compiled with this. This is the log -

2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:457] Graph ReCopy #nodes 6 #edges 8
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:461] || 
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:461] || () -> () {
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:461] ||   n2 = _Recv[client_terminated=true, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=6355887849539892017, tensor_name="Placeholder_4:0", tensor_type=float]()
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:461] ||   n3 = _Recv[client_terminated=true, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=6355887849539892017, tensor_name="Placeholder_3:0", tensor_type=float]()
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:461] ||   n4 = MatMul[T=float, transpose_a=false, transpose_b=false](n2, n3)
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:461] ||   n5 = _Send[T=float, client_terminated=true, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=6355887849539892017, tensor_name="MatMul:0"](n4)
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:461] || }
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:461] || 
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:311] Custom creator error: Invalid argument: No _XlaCompile for NoOp
2017-06-29 16:57:02: I tensorflow/core/framework/op_kernel.cc:858] Instantiating kernel for node: _SOURCE = NoOp[]()
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:311] Custom creator error: Invalid argument: No _XlaCompile for NoOp
2017-06-29 16:57:02: I tensorflow/core/framework/op_kernel.cc:858] Instantiating kernel for node: _SINK = NoOp[]()
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:311] Custom creator error: Invalid argument: No _XlaCompile for _Recv
2017-06-29 16:57:02: I tensorflow/core/framework/op_kernel.cc:858] Instantiating kernel for node: _recv_Placeholder_4_0 = _Recv[client_terminated=true, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=6355887849539892017, tensor_name="Placeholder_4:0", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:311] Custom creator error: Invalid argument: No _XlaCompile for _Recv
2017-06-29 16:57:02: I tensorflow/core/framework/op_kernel.cc:858] Instantiating kernel for node: _recv_Placeholder_3_0 = _Recv[client_terminated=true, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=6355887849539892017, tensor_name="Placeholder_3:0", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:311] Custom creator error: Invalid argument: No _XlaCompile for MatMul
2017-06-29 16:57:02: I tensorflow/core/framework/op_kernel.cc:858] Instantiating kernel for node: MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_Placeholder_4_0, _recv_Placeholder_3_0)
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:311] Custom creator error: Invalid argument: No _XlaCompile for _Send
2017-06-29 16:57:02: I tensorflow/core/framework/op_kernel.cc:858] Instantiating kernel for node: _send_MatMul_0 = _Send[T=DT_FLOAT, client_terminated=true, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=6355887849539892017, tensor_name="MatMul:0", _device="/job:localhost/replica:0/task:0/cpu:0"](MatMul)
2017-06-29 16:57:02: I tensorflow/core/common_runtime/executor.cc:477] default alloc case local type CPU remote type CPU
2017-06-29 16:57:02: I tensorflow/core/common_runtime/executor.cc:477] default alloc case local type CPU remote type CPU
2017-06-29 16:57:02: I tensorflow/core/common_runtime/executor.cc:504] default alloc case local type CPU remote type CPU

It says No _XlaCompile for MatMul :(

Edit - Updated Gist.

@DavidNorman
Copy link

Weird. When I run the second one of those 2 code fragments (I can't run the first one easily because my own device is higher priority than the CPU), then I get this:

2017-06-30 07:16:55.377485: I tensorflow/compiler/xla/service/buffer_assignment.cc:1309] HloModule cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=0,_XlaNumResourceArgs=0]_module:
2017-06-30 07:16:55.377490: I tensorflow/compiler/xla/service/buffer_assignment.cc:1309] 
2017-06-30 07:16:55.377494: I tensorflow/compiler/xla/service/buffer_assignment.cc:1309] ENTRY cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=0,_XlaNumResourceArgs=0].v5 (arg0: f32[2,2], arg1: f32[2,2]) -> f32[2,2] { 
2017-06-30 07:16:55.377498: I tensorflow/compiler/xla/service/buffer_assignment.cc:1309]   %arg0 = f32[2,2]{1,0} parameter(0)
2017-06-30 07:16:55.377502: I tensorflow/compiler/xla/service/buffer_assignment.cc:1309]   %arg1 = f32[2,2]{1,0} parameter(1)
2017-06-30 07:16:55.377506: I tensorflow/compiler/xla/service/buffer_assignment.cc:1309]   %dot = f32[2,2]{1,0} dot(f32[2,2]{1,0} %arg0, f32[2,2]{1,0} %arg1) # metadata=op_type: "MatMul" op_name: "MatMul"
2017-06-30 07:16:55.377510: I tensorflow/compiler/xla/service/buffer_assignment.cc:1309] }
2017-06-30 07:16:55.377513: I tensorflow/compiler/xla/service/buffer_assignment.cc:1309] 
.
.
.
2017-06-30 07:16:55.378781: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:752] HandleDot: 
2017-06-30 07:16:55.381238: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:753]   lhs operand:   %2 = bitcast i8* %1 to [2 x [2 x float]]*
2017-06-30 07:16:55.381255: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:755]   rhs operand:   %5 = bitcast i8* %4 to [2 x [2 x float]]*
2017-06-30 07:16:55.381264: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:757]   target:   %6 = bitcast i8* %retval to [2 x [2 x float]]*
2017-06-30 07:16:55.381309: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:1464] FinishVisit root: %dot = f32[2,2]{1,0} dot(f32[2,2]{1,0} %arg0, f32[2,2]{1,0} %arg1) # metadata=op_type: "MatMul" op_name: "MatMul"
.
.
.
2017-06-30 07:16:55.382122: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] define void @"cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=0,_XlaNumResourceArgs=0].v5"(i8* align 16 dereferenceable(16) %retval, i8* noalias %run_options, i8** noalias %params, i8** noalias %temps, i64* noalias %prof_counters) {
2017-06-30 07:16:55.382125: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] entry:
2017-06-30 07:16:55.382157: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100]   %0 = getelementptr inbounds i8*, i8** %params, i64 0
2017-06-30 07:16:55.382170: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100]   %1 = load i8*, i8** %0, !tbaa !0, !dereferenceable !3, !align !3
2017-06-30 07:16:55.382174: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100]   %2 = bitcast i8* %1 to [2 x [2 x float]]*
2017-06-30 07:16:55.382188: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100]   %3 = getelementptr inbounds i8*, i8** %params, i64 1
2017-06-30 07:16:55.382192: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100]   %4 = load i8*, i8** %3, !tbaa !0, !dereferenceable !3, !align !3
2017-06-30 07:16:55.382195: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100]   %5 = bitcast i8* %4 to [2 x [2 x float]]*
2017-06-30 07:16:55.382199: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100]   %6 = bitcast i8* %retval to [2 x [2 x float]]*
2017-06-30 07:16:55.382310: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100]   %7 = bitcast [2 x [2 x float]]* %6 to float*
2017-06-30 07:16:55.382319: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100]   %8 = bitcast [2 x [2 x float]]* %5 to float*
2017-06-30 07:16:55.382334: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100]   %9 = bitcast [2 x [2 x float]]* %2 to float*
2017-06-30 07:16:55.382338: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100]   call void @__xla_cpu_runtime_EigenMatMulF32(i8* %run_options, float* %7, float* %8, float* %9, i64 2, i64 2, i64 2, i32 0, i32 0)
2017-06-30 07:16:55.382343: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100]   %prof_counter_computation = getelementptr i64, i64* %prof_counters, i64 0
2017-06-30 07:16:55.382346: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100]   ret void
2017-06-30 07:16:55.382350: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] }
2017-06-30 07:16:55.382352: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] 
2017-06-30 07:16:55.382355: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] ; Function Attrs: argmemonly nounwind
2017-06-30 07:16:55.382359: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] declare void @__xla_cpu_runtime_EigenMatMulF32(i8*, float*, float*, float*, i64, i64, i64, i32, i32) #0
2017-06-30 07:16:55.382363: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] 
2017-06-30 07:16:55.382366: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] attributes #0 = { argmemonly nounwind }
2017-06-30 07:16:55.382378: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] 
2017-06-30 07:16:55.382381: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] !0 = !{!1, !1, i64 0}
2017-06-30 07:16:55.382385: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] !1 = !{!"pointer-to element_type: F32 dimensions: 2 dimensions: 2 layout { minor_to_major: 1 minor_to_major: 0 }", !2}
2017-06-30 07:16:55.382513: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] !2 = !{!"XLA TBAA"}
2017-06-30 07:16:55.382533: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] !3 = !{i64 16}
2017-06-30 07:16:55.385209: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] _cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=0,_XlaNumResourceArgs=0].v5:
2017-06-30 07:16:55.385224: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000000 	pushq	%r14
2017-06-30 07:16:55.385229: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000002 	pushq	%rbx
2017-06-30 07:16:55.385232: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000003 	subq	$72, %rsp
2017-06-30 07:16:55.385236: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000007 	movl	$2, %eax
2017-06-30 07:16:55.385240: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000000c 	movl	%eax, %r9d
2017-06-30 07:16:55.385244: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000000f 	xorl	%eax, %eax
2017-06-30 07:16:55.385248: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000011 	movq	(%rdx), %r10
2017-06-30 07:16:55.385251: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000014 	movq	8(%rdx), %rdx
2017-06-30 07:16:55.385255: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000018 	movq	%rsp, %r11
2017-06-30 07:16:55.385258: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000001b 	movl	$0, 16(%r11)
2017-06-30 07:16:55.385262: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000023 	movl	$0, 8(%r11)
2017-06-30 07:16:55.385266: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000002b 	movq	$2, (%r11)
2017-06-30 07:16:55.385269: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000032 	movabsq	$0, %r11
2017-06-30 07:16:55.385273: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000003c 	movl	$2, %ebx
2017-06-30 07:16:55.385277: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000041 	movl	%ebx, %r14d
2017-06-30 07:16:55.385280: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000044 	movq	%rdi, 64(%rsp)
2017-06-30 07:16:55.385284: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000049 	movq	%rsi, %rdi
2017-06-30 07:16:55.385288: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000004c 	movq	64(%rsp), %rsi
2017-06-30 07:16:55.385292: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000051 	movq	%rcx, 56(%rsp)
2017-06-30 07:16:55.385295: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000056 	movq	%r10, %rcx
2017-06-30 07:16:55.385299: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000059 	movq	%r8, 48(%rsp)
2017-06-30 07:16:55.385303: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000005e 	movq	%r14, %r8
2017-06-30 07:16:55.385306: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000061 	movq	%r9, 40(%rsp)
2017-06-30 07:16:55.385310: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000066 	movq	%r14, %r9
2017-06-30 07:16:55.385314: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000069 	movl	%eax, 36(%rsp)
2017-06-30 07:16:55.385317: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000006d 	callq	*%r11
2017-06-30 07:16:55.385321: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000070 	addq	$72, %rsp
2017-06-30 07:16:55.385325: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000074 	popq	%rbx
2017-06-30 07:16:55.385328: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000075 	popq	%r14
2017-06-30 07:16:55.385331: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000077 	retq

Are you trying to run the code on a version of Tensorflow built with the XLA options set? Perhaps try on a fresh clean repository with the XLA configuration?

@DavidNorman
Copy link

I don't know what your goal is here, so maybe this isn't helpful.

However, perhaps it is to experiment with different implementations of MatMul. If so, as you can see above, matmul is implemented as a call to a library function, so you could just replace the internals of that library function. It is called __xla_cpu_runtime_EigenSingleThreadedMatMulF32, in the file tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc. It calls a template function which makes some Eigen vector dot operations. You could replace the __xla_cpu_runtime_EigenSingleThreadedMatMulF32 with whatever C code you like, or some inline assembler.

@annanay25
Copy link
Author

annanay25 commented Jun 30, 2017

My aim is actually to obtain LLVM IR for the matmul operation in XLA.

As you have pointed out, the emitted IR contains call void @__xla_cpu_runtime_EigenMatMulF32. I would like that call to be replaced with the non-Eigen implementation of matmul, emitted directly as LLVM IR. I did the same with the Conv operator: here (link) you can see the check if (PotentiallyImplementedAsEigenConvolution(*convolution)) {, which I negated so that it always falls back to the non-Eigen implementation. This helped me get LLVM IR for convolution.

I wish to do the same and get LLVM IR for the matmul operator.

Thanks again @DavidNorman

@DavidNorman
Copy link

DavidNorman commented Jun 30, 2017

I see. Somewhere there must be the code which sets up the stack frame and emits the code seen in the trace (this stuff):

2017-06-30 07:16:55.385224: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000000 	pushq	%r14
2017-06-30 07:16:55.385229: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000002 	pushq	%rbx
2017-06-30 07:16:55.385232: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000003 	subq	$72, %rsp
2017-06-30 07:16:55.385236: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000007 	movl	$2, %eax
2017-06-30 07:16:55.385240: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000000c 	movl	%eax, %r9d
2017-06-30 07:16:55.385244: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000000f 	xorl	%eax, %eax
2017-06-30 07:16:55.385248: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000011 	movq	(%rdx), %r10
2017-06-30 07:16:55.385251: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000014 	movq	8(%rdx), %rdx
2017-06-30 07:16:55.385255: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000018 	movq	%rsp, %r11
2017-06-30 07:16:55.385258: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000001b 	movl	$0, 16(%r11)
2017-06-30 07:16:55.385262: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000023 	movl	$0, 8(%r11)
2017-06-30 07:16:55.385266: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000002b 	movq	$2, (%r11)
2017-06-30 07:16:55.385269: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000032 	movabsq	$0, %r11
2017-06-30 07:16:55.385273: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000003c 	movl	$2, %ebx
2017-06-30 07:16:55.385277: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000041 	movl	%ebx, %r14d
2017-06-30 07:16:55.385280: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000044 	movq	%rdi, 64(%rsp)
2017-06-30 07:16:55.385284: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000049 	movq	%rsi, %rdi
2017-06-30 07:16:55.385288: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000004c 	movq	64(%rsp), %rsi
2017-06-30 07:16:55.385292: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000051 	movq	%rcx, 56(%rsp)
2017-06-30 07:16:55.385295: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000056 	movq	%r10, %rcx
2017-06-30 07:16:55.385299: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000059 	movq	%r8, 48(%rsp)
2017-06-30 07:16:55.385303: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000005e 	movq	%r14, %r8
2017-06-30 07:16:55.385306: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000061 	movq	%r9, 40(%rsp)
2017-06-30 07:16:55.385310: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000066 	movq	%r14, %r9
2017-06-30 07:16:55.385314: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000069 	movl	%eax, 36(%rsp)
2017-06-30 07:16:55.385317: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000006d 	callq	*%r11
2017-06-30 07:16:55.385321: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000070 	addq	$72, %rsp
2017-06-30 07:16:55.385325: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000074 	popq	%rbx
2017-06-30 07:16:55.385328: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000075 	popq	%r14
2017-06-30 07:16:55.385331: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000077 	retq

I'm guessing this is written out using LLVM, so maybe you can find that and then add some alternative implementation. See DotOpEmitter::Emit in tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc.

@annanay25
Copy link
Author

@DavidNorman +1.

I changed

if (PotentiallyImplementedAsEigenDot(dot_)) {
    return EmitCallToRuntime();
  }

to

if (PotentiallyImplementedAsEigenDot(dot_)  && 0 ) {
    return EmitCallToRuntime();
  }

Still no XLA computation for matmul. Something else is wrong.
Could you change it in your codebase here and let me know if it works?

Thanks!

@DavidNorman
Copy link

It is working for me.

Are you definitely using a version of TensorFlow that has XLA built in? You are if you are targeting a device called /device:XLA_CPU:0.

2017-07-03 13:36:20.455473: I tensorflow/compiler/xla/service/hlo_instruction.cc:2097] Visiting HLO %dot
2017-07-03 13:36:20.455509: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:807] HandleDot: 
2017-07-03 13:36:20.455519: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:808]   lhs operand:   %2 = bitcast i8* %1 to [2 x [2 x float]]*
2017-07-03 13:36:20.455527: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:810]   rhs operand:   %5 = bitcast i8* %4 to [2 x [2 x float]]*
2017-07-03 13:36:20.455533: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:812]   target:   %6 = bitcast i8* %retval to [2 x [2 x float]]*
2017-07-03 13:36:20.455623: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:1746] FinishVisit root: %dot = f32[2,2]{1,0} dot(f32[2,2]{1,0} %arg0, f32[2,2]{1,0} %arg1) # metadata=op_type: "MatMul" op_name: "MatMul"
2017-07-03 13:36:20.455638: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:1748]   value:   %6 = bitcast i8* %retval to [2 x [2 x float]]*
2017-07-03 13:36:20.455662: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:64] IR before optimizations
2017-07-03 13:36:20.455735: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] ; ModuleID = '__compute_module'
2017-07-03 13:36:20.455740: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] source_filename = "__compute_module"
2017-07-03 13:36:20.455745: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
2017-07-03 13:36:20.455748: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] target triple = "x86_64-apple-darwin"
2017-07-03 13:36:20.455752: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] 
2017-07-03 13:36:20.455756: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] define void @"cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=0,_XlaNumResourceArgs=0].v5"(i8* align 16 dereferenceable(16) %retval, i8* noalias %run_options, i8** noalias %params, i8** noalias %temps, i64* noalias %prof_counters) {
2017-07-03 13:36:20.455761: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] entry:
2017-07-03 13:36:20.455764: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %accum_address = alloca float
2017-07-03 13:36:20.455768: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %invar_address.reduction = alloca i64
2017-07-03 13:36:20.455771: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %invar_address.rhs1 = alloca i64
2017-07-03 13:36:20.455775: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %invar_address.lhs0 = alloca i64
2017-07-03 13:36:20.455779: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %0 = getelementptr inbounds i8*, i8** %params, i64 0
2017-07-03 13:36:20.455783: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %1 = load i8*, i8** %0, !dereferenceable !0, !align !0
2017-07-03 13:36:20.455787: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %2 = bitcast i8* %1 to [2 x [2 x float]]*
2017-07-03 13:36:20.455791: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %3 = getelementptr inbounds i8*, i8** %params, i64 1
2017-07-03 13:36:20.455795: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %4 = load i8*, i8** %3, !dereferenceable !0, !align !0
2017-07-03 13:36:20.455799: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %5 = bitcast i8* %4 to [2 x [2 x float]]*
2017-07-03 13:36:20.455803: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %6 = bitcast i8* %retval to [2 x [2 x float]]*
2017-07-03 13:36:20.455808: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   store i64 0, i64* %invar_address.lhs0
2017-07-03 13:36:20.455812: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   br label %loop_header.lhs0
2017-07-03 13:36:20.455815: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] 
2017-07-03 13:36:20.455819: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_header.lhs0:                                 ; preds = %loop_exit.rhs1, %entry
2017-07-03 13:36:20.455823: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %indvar.lhs0 = load i64, i64* %invar_address.lhs0
2017-07-03 13:36:20.455827: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %7 = icmp uge i64 %indvar.lhs0, 2
2017-07-03 13:36:20.455832: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   br i1 %7, label %loop_exit.lhs0, label %loop_body.lhs0
2017-07-03 13:36:20.455835: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] 
2017-07-03 13:36:20.455839: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_body.lhs0:                                   ; preds = %loop_header.lhs0
2017-07-03 13:36:20.455843: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   store i64 0, i64* %invar_address.rhs1
2017-07-03 13:36:20.455847: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   br label %loop_header.rhs1
2017-07-03 13:36:20.455851: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] 
2017-07-03 13:36:20.455855: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_header.rhs1:                                 ; preds = %loop_exit.reduction, %loop_body.lhs0
2017-07-03 13:36:20.455860: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %indvar.rhs1 = load i64, i64* %invar_address.rhs1
2017-07-03 13:36:20.455864: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %8 = icmp uge i64 %indvar.rhs1, 2
2017-07-03 13:36:20.455868: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   br i1 %8, label %loop_exit.rhs1, label %loop_body.rhs1
2017-07-03 13:36:20.455872: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] 
2017-07-03 13:36:20.455876: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_body.rhs1:                                   ; preds = %loop_header.rhs1
2017-07-03 13:36:20.455880: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   store i64 0, i64* %invar_address.reduction
2017-07-03 13:36:20.455884: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   store float 0.000000e+00, float* %accum_address
2017-07-03 13:36:20.455897: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   br label %loop_header.reduction
2017-07-03 13:36:20.455900: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] 
2017-07-03 13:36:20.455904: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_header.reduction:                            ; preds = %loop_body.reduction, %loop_body.rhs1
2017-07-03 13:36:20.455908: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %indvar.reduction = load i64, i64* %invar_address.reduction
2017-07-03 13:36:20.455912: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %9 = icmp uge i64 %indvar.reduction, 2
2017-07-03 13:36:20.455916: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   br i1 %9, label %loop_exit.reduction, label %loop_body.reduction
2017-07-03 13:36:20.455919: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] 
2017-07-03 13:36:20.455923: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_body.reduction:                              ; preds = %loop_header.reduction
2017-07-03 13:36:20.455927: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %10 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %2, i64 0, i64 %indvar.lhs0, i64 %indvar.reduction
2017-07-03 13:36:20.455932: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %11 = load float, float* %10, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.455937: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %12 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %5, i64 0, i64 %indvar.reduction, i64 %indvar.rhs1
2017-07-03 13:36:20.455941: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %13 = load float, float* %12, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.455945: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %14 = fmul fast float %11, %13
2017-07-03 13:36:20.455949: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %15 = load float, float* %accum_address
2017-07-03 13:36:20.455952: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %16 = fadd fast float %15, %14
2017-07-03 13:36:20.455956: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   store float %16, float* %accum_address
2017-07-03 13:36:20.455960: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %invar.inc2 = add nuw nsw i64 %indvar.reduction, 1
2017-07-03 13:36:20.455964: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   store i64 %invar.inc2, i64* %invar_address.reduction
2017-07-03 13:36:20.455968: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   br label %loop_header.reduction
2017-07-03 13:36:20.455971: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] 
2017-07-03 13:36:20.455975: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_exit.reduction:                              ; preds = %loop_header.reduction
2017-07-03 13:36:20.455978: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %17 = load float, float* %accum_address
2017-07-03 13:36:20.455983: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %18 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %6, i64 0, i64 %indvar.lhs0, i64 %indvar.rhs1
2017-07-03 13:36:20.455987: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   store float %17, float* %18, !alias.scope !2
2017-07-03 13:36:20.455991: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %invar.inc1 = add nuw nsw i64 %indvar.rhs1, 1
2017-07-03 13:36:20.455994: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   store i64 %invar.inc1, i64* %invar_address.rhs1
2017-07-03 13:36:20.455998: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   br label %loop_header.rhs1
2017-07-03 13:36:20.456002: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] 
2017-07-03 13:36:20.456005: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_exit.rhs1:                                   ; preds = %loop_header.rhs1
2017-07-03 13:36:20.456009: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %invar.inc = add nuw nsw i64 %indvar.lhs0, 1
2017-07-03 13:36:20.456013: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   store i64 %invar.inc, i64* %invar_address.lhs0
2017-07-03 13:36:20.456017: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   br label %loop_header.lhs0
2017-07-03 13:36:20.456021: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] 
2017-07-03 13:36:20.456024: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_exit.lhs0:                                   ; preds = %loop_header.lhs0
2017-07-03 13:36:20.456028: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   %prof_counter_computation = getelementptr i64, i64* %prof_counters, i64 0
2017-07-03 13:36:20.456032: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]   ret void
2017-07-03 13:36:20.456035: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] }
2017-07-03 13:36:20.456038: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] 
2017-07-03 13:36:20.456041: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] !0 = !{i64 16}
2017-07-03 13:36:20.456044: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] !1 = !{}
2017-07-03 13:36:20.456047: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] !2 = !{!3}
2017-07-03 13:36:20.456051: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] !3 = !{!"buffer: {index:0, offset:0, size:16}", !4}
2017-07-03 13:36:20.456054: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] !4 = distinct !{!4}
2017-07-03 13:36:20.461396: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:91] IR after optimizations
2017-07-03 13:36:20.461468: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] ; ModuleID = '__compute_module'
2017-07-03 13:36:20.461475: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] source_filename = "__compute_module"
2017-07-03 13:36:20.461479: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
2017-07-03 13:36:20.461482: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] target triple = "x86_64-apple-darwin"
2017-07-03 13:36:20.461497: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] 
2017-07-03 13:36:20.461500: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] ; Function Attrs: norecurse nounwind
2017-07-03 13:36:20.461505: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] define void @"cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=0,_XlaNumResourceArgs=0].v5"(i8* nocapture align 16 dereferenceable(16) %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readonly %params, i8** noalias nocapture readnone %temps, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
2017-07-03 13:36:20.461509: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] entry:
2017-07-03 13:36:20.461513: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %0 = bitcast i8** %params to [2 x [2 x float]]**
2017-07-03 13:36:20.461516: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %1 = load [2 x [2 x float]]*, [2 x [2 x float]]** %0, align 8, !dereferenceable !0, !align !0
2017-07-03 13:36:20.461520: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %2 = getelementptr inbounds i8*, i8** %params, i64 1
2017-07-03 13:36:20.461524: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %3 = bitcast i8** %2 to [2 x [2 x float]]**
2017-07-03 13:36:20.461527: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %4 = load [2 x [2 x float]]*, [2 x [2 x float]]** %3, align 8, !dereferenceable !0, !align !0
2017-07-03 13:36:20.461541: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %5 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %1, i64 0, i64 0, i64 0
2017-07-03 13:36:20.461638: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %6 = load float, float* %5, align 16, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.461659: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %7 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %4, i64 0, i64 0, i64 0
2017-07-03 13:36:20.461663: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %8 = load float, float* %7, align 16, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.461667: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %9 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %1, i64 0, i64 0, i64 1
2017-07-03 13:36:20.461671: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %10 = load float, float* %9, align 4, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.461674: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %11 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %4, i64 0, i64 1, i64 0
2017-07-03 13:36:20.461678: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %12 = load float, float* %11, align 8, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.461682: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %13 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %4, i64 0, i64 0, i64 1
2017-07-03 13:36:20.461685: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %14 = load float, float* %13, align 4, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.461689: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %15 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %4, i64 0, i64 1, i64 1
2017-07-03 13:36:20.461693: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %16 = load float, float* %15, align 4, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.461828: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %17 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %1, i64 0, i64 1, i64 0
2017-07-03 13:36:20.461849: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %18 = load float, float* %17, align 8, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.461854: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %19 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %1, i64 0, i64 1, i64 1
2017-07-03 13:36:20.461858: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %20 = load float, float* %19, align 4, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.461863: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %21 = insertelement <4 x float> undef, float %8, i32 0
2017-07-03 13:36:20.461867: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %22 = insertelement <4 x float> %21, float %14, i32 1
2017-07-03 13:36:20.461871: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %23 = insertelement <4 x float> %22, float %8, i32 2
2017-07-03 13:36:20.461875: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %24 = insertelement <4 x float> %23, float %14, i32 3
2017-07-03 13:36:20.461878: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %25 = insertelement <4 x float> undef, float %6, i32 0
2017-07-03 13:36:20.461882: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %26 = insertelement <4 x float> %25, float %6, i32 1
2017-07-03 13:36:20.461896: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %27 = insertelement <4 x float> %26, float %18, i32 2
2017-07-03 13:36:20.461992: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %28 = insertelement <4 x float> %27, float %18, i32 3
2017-07-03 13:36:20.462012: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %29 = fmul fast <4 x float> %24, %28
2017-07-03 13:36:20.462017: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %30 = insertelement <4 x float> undef, float %12, i32 0
2017-07-03 13:36:20.462021: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %31 = insertelement <4 x float> %30, float %16, i32 1
2017-07-03 13:36:20.462024: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %32 = insertelement <4 x float> %31, float %12, i32 2
2017-07-03 13:36:20.462028: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %33 = insertelement <4 x float> %32, float %16, i32 3
2017-07-03 13:36:20.462032: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %34 = insertelement <4 x float> undef, float %10, i32 0
2017-07-03 13:36:20.462036: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %35 = insertelement <4 x float> %34, float %10, i32 1
2017-07-03 13:36:20.462040: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %36 = insertelement <4 x float> %35, float %20, i32 2
2017-07-03 13:36:20.462044: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %37 = insertelement <4 x float> %36, float %20, i32 3
2017-07-03 13:36:20.462048: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %38 = fmul fast <4 x float> %33, %37
2017-07-03 13:36:20.462061: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %39 = fadd fast <4 x float> %38, %29
2017-07-03 13:36:20.462064: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   %40 = bitcast i8* %retval to <4 x float>*
2017-07-03 13:36:20.462168: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   store <4 x float> %39, <4 x float>* %40, align 16, !alias.scope !2
2017-07-03 13:36:20.462177: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]   ret void
2017-07-03 13:36:20.462191: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] }
2017-07-03 13:36:20.462194: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] 
2017-07-03 13:36:20.462197: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] attributes #0 = { norecurse nounwind }
2017-07-03 13:36:20.462201: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] 
2017-07-03 13:36:20.462203: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] !0 = !{i64 16}
2017-07-03 13:36:20.462206: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] !1 = !{}
2017-07-03 13:36:20.462209: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] !2 = !{!3}
2017-07-03 13:36:20.462212: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] !3 = !{!"buffer: {index:0, offset:0, size:16}", !4}
2017-07-03 13:36:20.462216: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] !4 = distinct !{!4}
2017-07-03 13:36:20.465761: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] _cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=0,_XlaNumResourceArgs=0].v5:
2017-07-03 13:36:20.465777: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000000 	movq	(%rdx), %rax
2017-07-03 13:36:20.465783: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000003 	movq	8(%rdx), %rcx
2017-07-03 13:36:20.465787: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000007 	vmovsd	(%rcx), %xmm0
2017-07-03 13:36:20.465791: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x0000000b 	vmovsd	8(%rcx), %xmm1
2017-07-03 13:36:20.465795: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000010 	vinsertps	$32, (%rcx), %xmm0, %xmm0
2017-07-03 13:36:20.465799: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000016 	vinsertps	$48, 4(%rcx), %xmm0, %xmm0
2017-07-03 13:36:20.465803: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x0000001d 	vmovss	8(%rax), %xmm2
2017-07-03 13:36:20.465808: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000022 	vmovss	(%rax), %xmm3
2017-07-03 13:36:20.465811: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000026 	vmovss	4(%rax), %xmm4
2017-07-03 13:36:20.465815: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x0000002b 	vshufps	$0, %xmm2, %xmm3, %xmm2
2017-07-03 13:36:20.465819: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000030 	vmulps	%xmm2, %xmm0, %xmm0
2017-07-03 13:36:20.465822: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000034 	vinsertps	$32, 8(%rcx), %xmm1, %xmm1
2017-07-03 13:36:20.465826: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x0000003b 	vinsertps	$48, 12(%rcx), %xmm1, %xmm1
2017-07-03 13:36:20.465830: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000042 	vmovss	12(%rax), %xmm2
2017-07-03 13:36:20.465834: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000047 	vshufps	$0, %xmm2, %xmm4, %xmm2
2017-07-03 13:36:20.465837: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x0000004c 	vfmadd213ps	%xmm0, %xmm1, %xmm2
2017-07-03 13:36:20.465841: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000051 	vmovaps	%xmm2, (%rdi)
2017-07-03 13:36:20.465844: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000055 	retq

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment