from __future__ import print_function

import tensorflow as tf
import numpy
import time
import os
from random import randint

# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/tensorflow/mnist/input_data", one_hot=True)

# Parameters
learning_rate = 0.001
training_iters = 2000
batch_size = 128
display_step = 10

# Network Parameters
n_input = 784    # MNIST data input (img shape: 28*28)
n_classes = 10   # MNIST total classes (0-9 digits)
dropout = 0.75   # Dropout, probability to keep units

# tf Graph input
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_classes])
keep_prob = tf.placeholder(tf.float32)  # dropout (keep probability)

# Create some wrappers for simplicity
def conv2d(x, W, b, strides=1):
    # Conv2D wrapper with bias (no relu applied here)
    x = tf.reshape(x, shape=[-1, 28, 28, 1])
    x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
    x = tf.nn.bias_add(x, b)
    return x

# Store layers weight & bias
weights = {
    # 5x5 conv, 1 input, 32 outputs
    'wc1': tf.Variable(tf.random_normal([5, 5, 1, 32])),
}
biases = {
    'bc1': tf.Variable(tf.random_normal([32])),
}

# tf Graph input
# Placeholders for one flattened MNIST image (1 x 784) and its transpose (784 x 1)
a = tf.placeholder(tf.float32, [784, 1])
c = tf.placeholder(tf.float32, [1, 784])
d = tf.matmul(c, a)

# Initializing the variables
init = tf.global_variables_initializer()

# Launch the graph
config = tf.ConfigProto(
    intra_op_parallelism_threads=1,
    inter_op_parallelism_threads=1
)
jit_level = tf.OptimizerOptions.ON_1
jit_scope = tf.contrib.compiler.jit.experimental_jit_scope
config.graph_options.optimizer_options.global_jit_level = jit_level
run_metadata = tf.RunMetadata()

with tf.Session(config=config) as sess:
    sess.run(init)
    # Keep training until reach max iterations
    # while step * batch_size < training_iters:
    x = mnist.train.next_batch(batch_size)
    # z = tf.matmul(tf.reshape(x[0][0], [1, -1]), tf.reshape(x[0][0], [-1, 1]))
    num = randint(0, 10)
    x = numpy.reshape(x[0][num], (1, 784))
    beg = time.time()
    # sess.run(conv2d(x[0][0], weights['wc1'], biases['bc1']))
    # print(sess.run(tf.matmul(tf.reshape(x[0][0], [1, -1]), tf.reshape(x[0][0], [-1, 1]))))
    # print(sess.run(tf.multiply(tf.reshape(x[0][num], [1, -1]), 5.0)))
    # print(sess.run(tf.matmul(tf.reshape(x[0][num], [1, -1]), tf.reshape(x[0][num], [-1, 1]))))
    print(sess.run(d, feed_dict={c: x, a: numpy.transpose(x)}))
    print(numpy.shape(d))
    # sess.run(tf.multiply(x[0][0], tf.constant(5.0)))
    # print(numpy.shape(tf.reshape(x[0][0], [1, -1])))
    end = time.time()
    # beg = time.time()
    # with jit_scope():
    #     a = tf.matmul(x, y)
    #     print(a)
    # end = time.time()
    print(end - beg)
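Note that run_metadata is created above but never passed to sess.run, so no execution trace is actually collected. A minimal sketch of collecting one, assuming the standard TF 1.x RunOptions API (this would go inside the with tf.Session block above):

run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
result = sess.run(d, feed_dict={c: x, a: numpy.transpose(x)},
                  options=run_options, run_metadata=run_metadata)
# Walk the collected step stats to see which device ran which node; with JIT
# clustering active, the compiled cluster appears as a single launched kernel
# rather than a plain MatMul node.
for dev_stats in run_metadata.step_stats.dev_stats:
    print(dev_stats.device)
    for node_stats in dev_stats.node_stats:
        print('  ', node_stats.node_name)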
@DavidNorman, Thanks for this!
I JIT-compiled with this. Here is the log:
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:457] Graph ReCopy #nodes 6 #edges 8
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:461] ||
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:461] || () -> () {
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:461] || n2 = _Recv[client_terminated=true, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=6355887849539892017, tensor_name="Placeholder_4:0", tensor_type=float]()
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:461] || n3 = _Recv[client_terminated=true, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=6355887849539892017, tensor_name="Placeholder_3:0", tensor_type=float]()
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:461] || n4 = MatMul[T=float, transpose_a=false, transpose_b=false](n2, n3)
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:461] || n5 = _Send[T=float, client_terminated=true, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=6355887849539892017, tensor_name="MatMul:0"](n4)
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:461] || }
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:461] ||
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:311] Custom creator error: Invalid argument: No _XlaCompile for NoOp
2017-06-29 16:57:02: I tensorflow/core/framework/op_kernel.cc:858] Instantiating kernel for node: _SOURCE = NoOp[]()
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:311] Custom creator error: Invalid argument: No _XlaCompile for NoOp
2017-06-29 16:57:02: I tensorflow/core/framework/op_kernel.cc:858] Instantiating kernel for node: _SINK = NoOp[]()
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:311] Custom creator error: Invalid argument: No _XlaCompile for _Recv
2017-06-29 16:57:02: I tensorflow/core/framework/op_kernel.cc:858] Instantiating kernel for node: _recv_Placeholder_4_0 = _Recv[client_terminated=true, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=6355887849539892017, tensor_name="Placeholder_4:0", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:311] Custom creator error: Invalid argument: No _XlaCompile for _Recv
2017-06-29 16:57:02: I tensorflow/core/framework/op_kernel.cc:858] Instantiating kernel for node: _recv_Placeholder_3_0 = _Recv[client_terminated=true, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=6355887849539892017, tensor_name="Placeholder_3:0", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:311] Custom creator error: Invalid argument: No _XlaCompile for MatMul
2017-06-29 16:57:02: I tensorflow/core/framework/op_kernel.cc:858] Instantiating kernel for node: MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_Placeholder_4_0, _recv_Placeholder_3_0)
2017-06-29 16:57:02: I tensorflow/core/common_runtime/function.cc:311] Custom creator error: Invalid argument: No _XlaCompile for _Send
2017-06-29 16:57:02: I tensorflow/core/framework/op_kernel.cc:858] Instantiating kernel for node: _send_MatMul_0 = _Send[T=DT_FLOAT, client_terminated=true, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=6355887849539892017, tensor_name="MatMul:0", _device="/job:localhost/replica:0/task:0/cpu:0"](MatMul)
2017-06-29 16:57:02: I tensorflow/core/common_runtime/executor.cc:477] default alloc case local type CPU remote type CPU
2017-06-29 16:57:02: I tensorflow/core/common_runtime/executor.cc:477] default alloc case local type CPU remote type CPU
2017-06-29 16:57:02: I tensorflow/core/common_runtime/executor.cc:504] default alloc case local type CPU remote type CPU
It says No _XlaCompile for MatMul
:(
Edit - Updated Gist.
Weird. When I run the second of those two code fragments (I can't easily run the first one because my own device has higher priority than the CPU), I get this:
2017-06-30 07:16:55.377485: I tensorflow/compiler/xla/service/buffer_assignment.cc:1309] HloModule cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=0,_XlaNumResourceArgs=0]_module:
2017-06-30 07:16:55.377490: I tensorflow/compiler/xla/service/buffer_assignment.cc:1309]
2017-06-30 07:16:55.377494: I tensorflow/compiler/xla/service/buffer_assignment.cc:1309] ENTRY cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=0,_XlaNumResourceArgs=0].v5 (arg0: f32[2,2], arg1: f32[2,2]) -> f32[2,2] {
2017-06-30 07:16:55.377498: I tensorflow/compiler/xla/service/buffer_assignment.cc:1309] %arg0 = f32[2,2]{1,0} parameter(0)
2017-06-30 07:16:55.377502: I tensorflow/compiler/xla/service/buffer_assignment.cc:1309] %arg1 = f32[2,2]{1,0} parameter(1)
2017-06-30 07:16:55.377506: I tensorflow/compiler/xla/service/buffer_assignment.cc:1309] %dot = f32[2,2]{1,0} dot(f32[2,2]{1,0} %arg0, f32[2,2]{1,0} %arg1) # metadata=op_type: "MatMul" op_name: "MatMul"
2017-06-30 07:16:55.377510: I tensorflow/compiler/xla/service/buffer_assignment.cc:1309] }
2017-06-30 07:16:55.377513: I tensorflow/compiler/xla/service/buffer_assignment.cc:1309]
.
.
.
2017-06-30 07:16:55.378781: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:752] HandleDot:
2017-06-30 07:16:55.381238: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:753] lhs operand: %2 = bitcast i8* %1 to [2 x [2 x float]]*
2017-06-30 07:16:55.381255: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:755] rhs operand: %5 = bitcast i8* %4 to [2 x [2 x float]]*
2017-06-30 07:16:55.381264: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:757] target: %6 = bitcast i8* %retval to [2 x [2 x float]]*
2017-06-30 07:16:55.381309: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:1464] FinishVisit root: %dot = f32[2,2]{1,0} dot(f32[2,2]{1,0} %arg0, f32[2,2]{1,0} %arg1) # metadata=op_type: "MatMul" op_name: "MatMul"
.
.
.
2017-06-30 07:16:55.382122: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] define void @"cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=0,_XlaNumResourceArgs=0].v5"(i8* align 16 dereferenceable(16) %retval, i8* noalias %run_options, i8** noalias %params, i8** noalias %temps, i64* noalias %prof_counters) {
2017-06-30 07:16:55.382125: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] entry:
2017-06-30 07:16:55.382157: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] %0 = getelementptr inbounds i8*, i8** %params, i64 0
2017-06-30 07:16:55.382170: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] %1 = load i8*, i8** %0, !tbaa !0, !dereferenceable !3, !align !3
2017-06-30 07:16:55.382174: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] %2 = bitcast i8* %1 to [2 x [2 x float]]*
2017-06-30 07:16:55.382188: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] %3 = getelementptr inbounds i8*, i8** %params, i64 1
2017-06-30 07:16:55.382192: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] %4 = load i8*, i8** %3, !tbaa !0, !dereferenceable !3, !align !3
2017-06-30 07:16:55.382195: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] %5 = bitcast i8* %4 to [2 x [2 x float]]*
2017-06-30 07:16:55.382199: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] %6 = bitcast i8* %retval to [2 x [2 x float]]*
2017-06-30 07:16:55.382310: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] %7 = bitcast [2 x [2 x float]]* %6 to float*
2017-06-30 07:16:55.382319: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] %8 = bitcast [2 x [2 x float]]* %5 to float*
2017-06-30 07:16:55.382334: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] %9 = bitcast [2 x [2 x float]]* %2 to float*
2017-06-30 07:16:55.382338: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] call void @__xla_cpu_runtime_EigenMatMulF32(i8* %run_options, float* %7, float* %8, float* %9, i64 2, i64 2, i64 2, i32 0, i32 0)
2017-06-30 07:16:55.382343: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] %prof_counter_computation = getelementptr i64, i64* %prof_counters, i64 0
2017-06-30 07:16:55.382346: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] ret void
2017-06-30 07:16:55.382350: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] }
2017-06-30 07:16:55.382352: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100]
2017-06-30 07:16:55.382355: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] ; Function Attrs: argmemonly nounwind
2017-06-30 07:16:55.382359: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] declare void @__xla_cpu_runtime_EigenMatMulF32(i8*, float*, float*, float*, i64, i64, i64, i32, i32) #0
2017-06-30 07:16:55.382363: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100]
2017-06-30 07:16:55.382366: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] attributes #0 = { argmemonly nounwind }
2017-06-30 07:16:55.382378: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100]
2017-06-30 07:16:55.382381: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] !0 = !{!1, !1, i64 0}
2017-06-30 07:16:55.382385: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] !1 = !{!"pointer-to element_type: F32 dimensions: 2 dimensions: 2 layout { minor_to_major: 1 minor_to_major: 0 }", !2}
2017-06-30 07:16:55.382513: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] !2 = !{!"XLA TBAA"}
2017-06-30 07:16:55.382533: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:100] !3 = !{i64 16}
2017-06-30 07:16:55.385209: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] _cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=0,_XlaNumResourceArgs=0].v5:
2017-06-30 07:16:55.385224: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000000 pushq %r14
2017-06-30 07:16:55.385229: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000002 pushq %rbx
2017-06-30 07:16:55.385232: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000003 subq $72, %rsp
2017-06-30 07:16:55.385236: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000007 movl $2, %eax
2017-06-30 07:16:55.385240: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000000c movl %eax, %r9d
2017-06-30 07:16:55.385244: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000000f xorl %eax, %eax
2017-06-30 07:16:55.385248: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000011 movq (%rdx), %r10
2017-06-30 07:16:55.385251: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000014 movq 8(%rdx), %rdx
2017-06-30 07:16:55.385255: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000018 movq %rsp, %r11
2017-06-30 07:16:55.385258: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000001b movl $0, 16(%r11)
2017-06-30 07:16:55.385262: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000023 movl $0, 8(%r11)
2017-06-30 07:16:55.385266: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000002b movq $2, (%r11)
2017-06-30 07:16:55.385269: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000032 movabsq $0, %r11
2017-06-30 07:16:55.385273: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000003c movl $2, %ebx
2017-06-30 07:16:55.385277: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000041 movl %ebx, %r14d
2017-06-30 07:16:55.385280: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000044 movq %rdi, 64(%rsp)
2017-06-30 07:16:55.385284: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000049 movq %rsi, %rdi
2017-06-30 07:16:55.385288: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000004c movq 64(%rsp), %rsi
2017-06-30 07:16:55.385292: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000051 movq %rcx, 56(%rsp)
2017-06-30 07:16:55.385295: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000056 movq %r10, %rcx
2017-06-30 07:16:55.385299: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000059 movq %r8, 48(%rsp)
2017-06-30 07:16:55.385303: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000005e movq %r14, %r8
2017-06-30 07:16:55.385306: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000061 movq %r9, 40(%rsp)
2017-06-30 07:16:55.385310: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000066 movq %r14, %r9
2017-06-30 07:16:55.385314: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000069 movl %eax, 36(%rsp)
2017-06-30 07:16:55.385317: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000006d callq *%r11
2017-06-30 07:16:55.385321: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000070 addq $72, %rsp
2017-06-30 07:16:55.385325: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000074 popq %rbx
2017-06-30 07:16:55.385328: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000075 popq %r14
2017-06-30 07:16:55.385331: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000077 retq
Are you trying to run the code on a version of TensorFlow built with the XLA options set? Perhaps try a fresh, clean checkout configured with XLA?
I don't know what your goal is here, so maybe this isn't helpful.
However, perhaps it is to experiment with different implementations of MatMul. If so, as you can see above, matmul is implemented as a call to a library function, so you could just replace the internals of that library function. It is called __xla_cpu_runtime_EigenSingleThreadedMatMulF32, in the file tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc. It calls a template function which performs some Eigen vector dot operations. You could replace __xla_cpu_runtime_EigenSingleThreadedMatMulF32 with whatever C code you like, or some inline assembler.
My aim is actually to obtain LLVM IR for the matmul operation in XLA.
As you have pointed out, the matmul is lowered to a call:
call void @__xla_cpu_runtime_EigenMatMulF32
I would like that to be replaced with the non-Eigen implementation of matmul in LLVM IR. I did the same with the Conv operator: here (link) you can see the check
if (PotentiallyImplementedAsEigenConvolution(*convolution)) {
which I negated to always fall back to the non-Eigen implementation; that is how I got LLVM IR for convolution.
I wish to do the same and get LLVM IR for the matmul operator.
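Incidentally, the IR shown in the compiler_functor.cc lines of your log above looks like VLOG output, so one way to surface the LLVM IR without touching the code might be to raise the C++ verbose log level before TensorFlow is loaded (this is an assumption about the logging setup, and it only helps if the build keeps VLOG enabled):

import os
# Assumption: the compiler_functor.cc IR dumps are VLOG messages, so a higher
# verbose level should print them; this must be set before importing tensorflow.
os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '2'
import tensorflow as tf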
Thanks again @DavidNorman
I see. Somewhere there must be the code that sets up the stack frame and emits the code seen in the trace (this stuff):
2017-06-30 07:16:55.385224: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000000 pushq %r14
2017-06-30 07:16:55.385229: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000002 pushq %rbx
2017-06-30 07:16:55.385232: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000003 subq $72, %rsp
2017-06-30 07:16:55.385236: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000007 movl $2, %eax
2017-06-30 07:16:55.385240: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000000c movl %eax, %r9d
2017-06-30 07:16:55.385244: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000000f xorl %eax, %eax
2017-06-30 07:16:55.385248: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000011 movq (%rdx), %r10
2017-06-30 07:16:55.385251: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000014 movq 8(%rdx), %rdx
2017-06-30 07:16:55.385255: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000018 movq %rsp, %r11
2017-06-30 07:16:55.385258: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000001b movl $0, 16(%r11)
2017-06-30 07:16:55.385262: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000023 movl $0, 8(%r11)
2017-06-30 07:16:55.385266: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000002b movq $2, (%r11)
2017-06-30 07:16:55.385269: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000032 movabsq $0, %r11
2017-06-30 07:16:55.385273: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000003c movl $2, %ebx
2017-06-30 07:16:55.385277: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000041 movl %ebx, %r14d
2017-06-30 07:16:55.385280: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000044 movq %rdi, 64(%rsp)
2017-06-30 07:16:55.385284: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000049 movq %rsi, %rdi
2017-06-30 07:16:55.385288: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000004c movq 64(%rsp), %rsi
2017-06-30 07:16:55.385292: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000051 movq %rcx, 56(%rsp)
2017-06-30 07:16:55.385295: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000056 movq %r10, %rcx
2017-06-30 07:16:55.385299: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000059 movq %r8, 48(%rsp)
2017-06-30 07:16:55.385303: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000005e movq %r14, %r8
2017-06-30 07:16:55.385306: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000061 movq %r9, 40(%rsp)
2017-06-30 07:16:55.385310: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000066 movq %r14, %r9
2017-06-30 07:16:55.385314: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000069 movl %eax, 36(%rsp)
2017-06-30 07:16:55.385317: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x0000006d callq *%r11
2017-06-30 07:16:55.385321: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000070 addq $72, %rsp
2017-06-30 07:16:55.385325: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000074 popq %rbx
2017-06-30 07:16:55.385328: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000075 popq %r14
2017-06-30 07:16:55.385331: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:122] 0x00000077 retq
I'm guessing this is written out using LLVM, so maybe you can find that and then add some alternative implementation. See DotOpEmitter::Emit in tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc.
@DavidNorman +1.
I changed
if (PotentiallyImplementedAsEigenDot(dot_)) {
  return EmitCallToRuntime();
}
to
if (PotentiallyImplementedAsEigenDot(dot_) && 0) {
  return EmitCallToRuntime();
}
Still no XLA computation for matmul. Something else is wrong.
Could you change it in your codebase here and let me know if it works?
Thanks!
It is working for me.
Are you definitely using a version of TensorFlow that has XLA built in? You are if you are able to target a device called /device:XLA_CPU:0.
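A quick way to check is to list the locally registered devices and look for an XLA_CPU entry (a minimal sketch, assuming the standard TF 1.x client API):

from tensorflow.python.client import device_lib

# An XLA-enabled build registers XLA devices alongside the plain CPU,
# e.g. a device named "/device:XLA_CPU:0".
for dev in device_lib.list_local_devices():
    print(dev.name)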
2017-07-03 13:36:20.455473: I tensorflow/compiler/xla/service/hlo_instruction.cc:2097] Visiting HLO %dot
2017-07-03 13:36:20.455509: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:807] HandleDot:
2017-07-03 13:36:20.455519: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:808] lhs operand: %2 = bitcast i8* %1 to [2 x [2 x float]]*
2017-07-03 13:36:20.455527: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:810] rhs operand: %5 = bitcast i8* %4 to [2 x [2 x float]]*
2017-07-03 13:36:20.455533: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:812] target: %6 = bitcast i8* %retval to [2 x [2 x float]]*
2017-07-03 13:36:20.455623: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:1746] FinishVisit root: %dot = f32[2,2]{1,0} dot(f32[2,2]{1,0} %arg0, f32[2,2]{1,0} %arg1) # metadata=op_type: "MatMul" op_name: "MatMul"
2017-07-03 13:36:20.455638: I tensorflow/compiler/xla/service/cpu/ir_emitter.cc:1748] value: %6 = bitcast i8* %retval to [2 x [2 x float]]*
2017-07-03 13:36:20.455662: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:64] IR before optimizations
2017-07-03 13:36:20.455735: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] ; ModuleID = '__compute_module'
2017-07-03 13:36:20.455740: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] source_filename = "__compute_module"
2017-07-03 13:36:20.455745: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
2017-07-03 13:36:20.455748: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] target triple = "x86_64-apple-darwin"
2017-07-03 13:36:20.455752: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]
2017-07-03 13:36:20.455756: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] define void @"cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=0,_XlaNumResourceArgs=0].v5"(i8* align 16 dereferenceable(16) %retval, i8* noalias %run_options, i8** noalias %params, i8** noalias %temps, i64* noalias %prof_counters) {
2017-07-03 13:36:20.455761: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] entry:
2017-07-03 13:36:20.455764: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %accum_address = alloca float
2017-07-03 13:36:20.455768: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %invar_address.reduction = alloca i64
2017-07-03 13:36:20.455771: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %invar_address.rhs1 = alloca i64
2017-07-03 13:36:20.455775: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %invar_address.lhs0 = alloca i64
2017-07-03 13:36:20.455779: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %0 = getelementptr inbounds i8*, i8** %params, i64 0
2017-07-03 13:36:20.455783: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %1 = load i8*, i8** %0, !dereferenceable !0, !align !0
2017-07-03 13:36:20.455787: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %2 = bitcast i8* %1 to [2 x [2 x float]]*
2017-07-03 13:36:20.455791: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %3 = getelementptr inbounds i8*, i8** %params, i64 1
2017-07-03 13:36:20.455795: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %4 = load i8*, i8** %3, !dereferenceable !0, !align !0
2017-07-03 13:36:20.455799: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %5 = bitcast i8* %4 to [2 x [2 x float]]*
2017-07-03 13:36:20.455803: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %6 = bitcast i8* %retval to [2 x [2 x float]]*
2017-07-03 13:36:20.455808: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] store i64 0, i64* %invar_address.lhs0
2017-07-03 13:36:20.455812: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] br label %loop_header.lhs0
2017-07-03 13:36:20.455815: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]
2017-07-03 13:36:20.455819: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_header.lhs0: ; preds = %loop_exit.rhs1, %entry
2017-07-03 13:36:20.455823: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %indvar.lhs0 = load i64, i64* %invar_address.lhs0
2017-07-03 13:36:20.455827: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %7 = icmp uge i64 %indvar.lhs0, 2
2017-07-03 13:36:20.455832: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] br i1 %7, label %loop_exit.lhs0, label %loop_body.lhs0
2017-07-03 13:36:20.455835: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]
2017-07-03 13:36:20.455839: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_body.lhs0: ; preds = %loop_header.lhs0
2017-07-03 13:36:20.455843: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] store i64 0, i64* %invar_address.rhs1
2017-07-03 13:36:20.455847: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] br label %loop_header.rhs1
2017-07-03 13:36:20.455851: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]
2017-07-03 13:36:20.455855: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_header.rhs1: ; preds = %loop_exit.reduction, %loop_body.lhs0
2017-07-03 13:36:20.455860: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %indvar.rhs1 = load i64, i64* %invar_address.rhs1
2017-07-03 13:36:20.455864: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %8 = icmp uge i64 %indvar.rhs1, 2
2017-07-03 13:36:20.455868: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] br i1 %8, label %loop_exit.rhs1, label %loop_body.rhs1
2017-07-03 13:36:20.455872: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]
2017-07-03 13:36:20.455876: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_body.rhs1: ; preds = %loop_header.rhs1
2017-07-03 13:36:20.455880: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] store i64 0, i64* %invar_address.reduction
2017-07-03 13:36:20.455884: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] store float 0.000000e+00, float* %accum_address
2017-07-03 13:36:20.455897: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] br label %loop_header.reduction
2017-07-03 13:36:20.455900: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]
2017-07-03 13:36:20.455904: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_header.reduction: ; preds = %loop_body.reduction, %loop_body.rhs1
2017-07-03 13:36:20.455908: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %indvar.reduction = load i64, i64* %invar_address.reduction
2017-07-03 13:36:20.455912: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %9 = icmp uge i64 %indvar.reduction, 2
2017-07-03 13:36:20.455916: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] br i1 %9, label %loop_exit.reduction, label %loop_body.reduction
2017-07-03 13:36:20.455919: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]
2017-07-03 13:36:20.455923: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_body.reduction: ; preds = %loop_header.reduction
2017-07-03 13:36:20.455927: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %10 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %2, i64 0, i64 %indvar.lhs0, i64 %indvar.reduction
2017-07-03 13:36:20.455932: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %11 = load float, float* %10, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.455937: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %12 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %5, i64 0, i64 %indvar.reduction, i64 %indvar.rhs1
2017-07-03 13:36:20.455941: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %13 = load float, float* %12, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.455945: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %14 = fmul fast float %11, %13
2017-07-03 13:36:20.455949: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %15 = load float, float* %accum_address
2017-07-03 13:36:20.455952: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %16 = fadd fast float %15, %14
2017-07-03 13:36:20.455956: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] store float %16, float* %accum_address
2017-07-03 13:36:20.455960: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %invar.inc2 = add nuw nsw i64 %indvar.reduction, 1
2017-07-03 13:36:20.455964: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] store i64 %invar.inc2, i64* %invar_address.reduction
2017-07-03 13:36:20.455968: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] br label %loop_header.reduction
2017-07-03 13:36:20.455971: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]
2017-07-03 13:36:20.455975: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_exit.reduction: ; preds = %loop_header.reduction
2017-07-03 13:36:20.455978: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %17 = load float, float* %accum_address
2017-07-03 13:36:20.455983: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %18 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %6, i64 0, i64 %indvar.lhs0, i64 %indvar.rhs1
2017-07-03 13:36:20.455987: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] store float %17, float* %18, !alias.scope !2
2017-07-03 13:36:20.455991: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %invar.inc1 = add nuw nsw i64 %indvar.rhs1, 1
2017-07-03 13:36:20.455994: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] store i64 %invar.inc1, i64* %invar_address.rhs1
2017-07-03 13:36:20.455998: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] br label %loop_header.rhs1
2017-07-03 13:36:20.456002: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]
2017-07-03 13:36:20.456005: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_exit.rhs1: ; preds = %loop_header.rhs1
2017-07-03 13:36:20.456009: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %invar.inc = add nuw nsw i64 %indvar.lhs0, 1
2017-07-03 13:36:20.456013: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] store i64 %invar.inc, i64* %invar_address.lhs0
2017-07-03 13:36:20.456017: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] br label %loop_header.lhs0
2017-07-03 13:36:20.456021: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]
2017-07-03 13:36:20.456024: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] loop_exit.lhs0: ; preds = %loop_header.lhs0
2017-07-03 13:36:20.456028: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] %prof_counter_computation = getelementptr i64, i64* %prof_counters, i64 0
2017-07-03 13:36:20.456032: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] ret void
2017-07-03 13:36:20.456035: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] }
2017-07-03 13:36:20.456038: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65]
2017-07-03 13:36:20.456041: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] !0 = !{i64 16}
2017-07-03 13:36:20.456044: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] !1 = !{}
2017-07-03 13:36:20.456047: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] !2 = !{!3}
2017-07-03 13:36:20.456051: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] !3 = !{!"buffer: {index:0, offset:0, size:16}", !4}
2017-07-03 13:36:20.456054: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:65] !4 = distinct !{!4}
2017-07-03 13:36:20.461396: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:91] IR after optimizations
2017-07-03 13:36:20.461468: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] ; ModuleID = '__compute_module'
2017-07-03 13:36:20.461475: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] source_filename = "__compute_module"
2017-07-03 13:36:20.461479: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
2017-07-03 13:36:20.461482: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] target triple = "x86_64-apple-darwin"
2017-07-03 13:36:20.461497: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]
2017-07-03 13:36:20.461500: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] ; Function Attrs: norecurse nounwind
2017-07-03 13:36:20.461505: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] define void @"cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=0,_XlaNumResourceArgs=0].v5"(i8* nocapture align 16 dereferenceable(16) %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readonly %params, i8** noalias nocapture readnone %temps, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
2017-07-03 13:36:20.461509: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] entry:
2017-07-03 13:36:20.461513: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %0 = bitcast i8** %params to [2 x [2 x float]]**
2017-07-03 13:36:20.461516: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %1 = load [2 x [2 x float]]*, [2 x [2 x float]]** %0, align 8, !dereferenceable !0, !align !0
2017-07-03 13:36:20.461520: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %2 = getelementptr inbounds i8*, i8** %params, i64 1
2017-07-03 13:36:20.461524: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %3 = bitcast i8** %2 to [2 x [2 x float]]**
2017-07-03 13:36:20.461527: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %4 = load [2 x [2 x float]]*, [2 x [2 x float]]** %3, align 8, !dereferenceable !0, !align !0
2017-07-03 13:36:20.461541: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %5 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %1, i64 0, i64 0, i64 0
2017-07-03 13:36:20.461638: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %6 = load float, float* %5, align 16, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.461659: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %7 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %4, i64 0, i64 0, i64 0
2017-07-03 13:36:20.461663: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %8 = load float, float* %7, align 16, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.461667: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %9 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %1, i64 0, i64 0, i64 1
2017-07-03 13:36:20.461671: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %10 = load float, float* %9, align 4, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.461674: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %11 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %4, i64 0, i64 1, i64 0
2017-07-03 13:36:20.461678: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %12 = load float, float* %11, align 8, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.461682: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %13 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %4, i64 0, i64 0, i64 1
2017-07-03 13:36:20.461685: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %14 = load float, float* %13, align 4, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.461689: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %15 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %4, i64 0, i64 1, i64 1
2017-07-03 13:36:20.461693: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %16 = load float, float* %15, align 4, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.461828: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %17 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %1, i64 0, i64 1, i64 0
2017-07-03 13:36:20.461849: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %18 = load float, float* %17, align 8, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.461854: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %19 = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %1, i64 0, i64 1, i64 1
2017-07-03 13:36:20.461858: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %20 = load float, float* %19, align 4, !invariant.load !1, !noalias !2
2017-07-03 13:36:20.461863: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %21 = insertelement <4 x float> undef, float %8, i32 0
2017-07-03 13:36:20.461867: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %22 = insertelement <4 x float> %21, float %14, i32 1
2017-07-03 13:36:20.461871: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %23 = insertelement <4 x float> %22, float %8, i32 2
2017-07-03 13:36:20.461875: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %24 = insertelement <4 x float> %23, float %14, i32 3
2017-07-03 13:36:20.461878: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %25 = insertelement <4 x float> undef, float %6, i32 0
2017-07-03 13:36:20.461882: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %26 = insertelement <4 x float> %25, float %6, i32 1
2017-07-03 13:36:20.461896: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %27 = insertelement <4 x float> %26, float %18, i32 2
2017-07-03 13:36:20.461992: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %28 = insertelement <4 x float> %27, float %18, i32 3
2017-07-03 13:36:20.462012: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %29 = fmul fast <4 x float> %24, %28
2017-07-03 13:36:20.462017: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %30 = insertelement <4 x float> undef, float %12, i32 0
2017-07-03 13:36:20.462021: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %31 = insertelement <4 x float> %30, float %16, i32 1
2017-07-03 13:36:20.462024: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %32 = insertelement <4 x float> %31, float %12, i32 2
2017-07-03 13:36:20.462028: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %33 = insertelement <4 x float> %32, float %16, i32 3
2017-07-03 13:36:20.462032: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %34 = insertelement <4 x float> undef, float %10, i32 0
2017-07-03 13:36:20.462036: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %35 = insertelement <4 x float> %34, float %10, i32 1
2017-07-03 13:36:20.462040: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %36 = insertelement <4 x float> %35, float %20, i32 2
2017-07-03 13:36:20.462044: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %37 = insertelement <4 x float> %36, float %20, i32 3
2017-07-03 13:36:20.462048: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %38 = fmul fast <4 x float> %33, %37
2017-07-03 13:36:20.462061: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %39 = fadd fast <4 x float> %38, %29
2017-07-03 13:36:20.462064: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] %40 = bitcast i8* %retval to <4 x float>*
2017-07-03 13:36:20.462168: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] store <4 x float> %39, <4 x float>* %40, align 16, !alias.scope !2
2017-07-03 13:36:20.462177: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] ret void
2017-07-03 13:36:20.462191: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] }
2017-07-03 13:36:20.462194: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]
2017-07-03 13:36:20.462197: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] attributes #0 = { norecurse nounwind }
2017-07-03 13:36:20.462201: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92]
2017-07-03 13:36:20.462203: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] !0 = !{i64 16}
2017-07-03 13:36:20.462206: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] !1 = !{}
2017-07-03 13:36:20.462209: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] !2 = !{!3}
2017-07-03 13:36:20.462212: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] !3 = !{!"buffer: {index:0, offset:0, size:16}", !4}
2017-07-03 13:36:20.462216: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:92] !4 = distinct !{!4}
2017-07-03 13:36:20.465761: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] _cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=0,_XlaNumResourceArgs=0].v5:
2017-07-03 13:36:20.465777: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000000 movq (%rdx), %rax
2017-07-03 13:36:20.465783: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000003 movq 8(%rdx), %rcx
2017-07-03 13:36:20.465787: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000007 vmovsd (%rcx), %xmm0
2017-07-03 13:36:20.465791: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x0000000b vmovsd 8(%rcx), %xmm1
2017-07-03 13:36:20.465795: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000010 vinsertps $32, (%rcx), %xmm0, %xmm0
2017-07-03 13:36:20.465799: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000016 vinsertps $48, 4(%rcx), %xmm0, %xmm0
2017-07-03 13:36:20.465803: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x0000001d vmovss 8(%rax), %xmm2
2017-07-03 13:36:20.465808: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000022 vmovss (%rax), %xmm3
2017-07-03 13:36:20.465811: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000026 vmovss 4(%rax), %xmm4
2017-07-03 13:36:20.465815: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x0000002b vshufps $0, %xmm2, %xmm3, %xmm2
2017-07-03 13:36:20.465819: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000030 vmulps %xmm2, %xmm0, %xmm0
2017-07-03 13:36:20.465822: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000034 vinsertps $32, 8(%rcx), %xmm1, %xmm1
2017-07-03 13:36:20.465826: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x0000003b vinsertps $48, 12(%rcx), %xmm1, %xmm1
2017-07-03 13:36:20.465830: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000042 vmovss 12(%rax), %xmm2
2017-07-03 13:36:20.465834: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000047 vshufps $0, %xmm2, %xmm4, %xmm2
2017-07-03 13:36:20.465837: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x0000004c vfmadd213ps %xmm0, %xmm1, %xmm2
2017-07-03 13:36:20.465841: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000051 vmovaps %xmm2, (%rdi)
2017-07-03 13:36:20.465844: I tensorflow/compiler/xla/service/cpu/compiler_functor.cc:118] 0x00000055 retq
Try something like this (I may have the JIT enable setting slightly wrong):
The way I would do it is:
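Roughly, a minimal sketch of flipping the JIT switch at the session level, following the ConfigProto pattern already used in the gist above (the exact jit_level value is an assumption and may need adjusting):

config = tf.ConfigProto()
# Enable global JIT so eligible ops (like the MatMul here) are clustered for XLA.
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

with tf.Session(config=config) as sess:
    print(sess.run(d, feed_dict={c: x, a: numpy.transpose(x)}))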