inferrna · November 28, 2016 16:35
diff --git a/out.py b/out.py
 $ OFFSET32_BIT=1 CLANG_HOME=/usr/lib/llvm-3.8 py.test -svx test/test_compile.py -k pointerpointer.cu-myte6kernel
 ======================================================================== test session starts =========================================================================
 platform linux -- Python 3.5.2, pytest-2.9.1, py-1.4.31, pluggy-0.3.1 -- /usr/bin/python3
 cachedir: .cache
 rootdir: /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl, inifile: pytest.ini
 collecting 0 itemsmarking xfail
 marking xfail
 marking xfail
 marking xfail
 collected 9 items 

 test/test_compile.py::test_compile[test/pointerpointer.cu-myte6kernel] context <pyopencl.Context at 0x1c2fc60 on <pyopencl.Device 'Pitcairn' on 'AMD Accelerated Parallel Processing' at 0x1cbcbd0>>
 options []
 bin/cocl -c /tmp/testprog.cu
 warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
 1 warning generated.
 warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
 warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
 2 warnings generated.
 clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
 clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
 warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
 1 warning generated.

 build/ir-to-opencl --inputfile /tmp/testprog-device.ll --outputfile /tmp/testprog-device.cl --kernelname  --add_ir_to_cl
 terminate called after throwing an instance of 'std::runtime_error'
  what():  Couldnt find kernel 

 mangledname _Z11myte6kernelP16TensorEvaluator6PfP9GpuDeviceiii
 options []
 bin/cocl -c /tmp/testprog.cu
 warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
 1 warning generated.
 warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
 warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
 2 warnings generated.
 clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
 clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
 warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
 1 warning generated.

 build/ir-to-opencl --inputfile /tmp/testprog-device.ll --outputfile /tmp/testprog-device.cl --kernelname _Z11myte6kernelP16TensorEvaluator6PfP9GpuDeviceiii --add_ir_to_cl

 FAILED
 ====================================================================== short test summary info =======================================================================
 FAIL test/test_compile.py::test_compile[test/pointerpointer.cu-myte6kernel]

 ============================================================================== FAILURES ==============================================================================
 __________________________________________________________ test_compile[test/pointerpointer.cu-myte6kernel] __________________________________________________________

 context = <pyopencl.Context at 0x1c2fc60 on <pyopencl.Device 'Pitcairn' on 'AMD Accelerated Parallel Processing' at 0x1cbcbd0>>
 cu_filepath = 'test/pointerpointer.cu', kernelname = 'myte6kernel'

    @pytest.mark.parametrize("cu_filepath,kernelname", get_test_definitions())
    def test_compile(context, cu_filepath, kernelname):
        with open(cu_filepath, 'r') as f:
            cu_code = f.read()
    
        try:
            cl_code = test_common.cu_to_cl(cu_code, '')
        except:
            pass
    
        with open('/tmp/testprog-device.ll') as f:
            ll_code = f.read()
    
        for line in ll_code.split('\n'):
            if line.startswith('define') and kernelname in line:
                mangledname = line.split('@')[1].split('(')[0]
                break
    
        print('mangledname', mangledname)
    
        cl_code = test_common.cu_to_cl(cu_code, mangledname)
    
 >       test_common.build_kernel(context, cl_code, mangledname)

 test/test_compile.py:47: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
 test/test_common.py:232: in build_kernel
    prog = cl.Program(context, cl_sourcecode).build()
 /usr/local/lib/python3.5/dist-packages/pyopencl/__init__.py:382: in build
    options=options, source=self._source)
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 self = <pyopencl.Program object at 0x7f30a1f62c88>, build_func = <function Program.build.<locals>.<lambda> at 0x7f30b019fe18>
 options = ['-I', '/usr/local/lib/python3.5/dist-packages/pyopencl/cl']
 source = 'struct class_HalfImpl {\n    short f0;\n};\nstruct class_GpuDevice {\n    int f0;\n    global struct class_StreamInte...itofp v8 */;\n    v10 = (float)(v6 + 123);\n    /* void v11 = store v10 data */;\n    data[0] = v10;\n    return;\n}\n'

    def _build_and_catch_errors(self, build_func, options, source=None):
        try:
            return build_func()
        except _cl.RuntimeError as e:
            what = e.what
            if options:
                what = what + "\n(options: %s)" % " ".join(options)
    
            if source is not None:
                from tempfile import NamedTemporaryFile
                srcfile = NamedTemporaryFile(mode="wt", delete=False, suffix=".cl")
                try:
                    srcfile.write(source)
                finally:
                    srcfile.close()
    
                what = what + "\n(source saved as %s)" % srcfile.name
    
            code = e.code
            routine = e.routine
    
            err = _cl.RuntimeError(
                    _ErrorRecord(
                        what=lambda: what,
                        code=lambda: code,
                        routine=lambda: routine))
    
        # Python 3.2 outputs the whole list of currently active exceptions
        # This serves to remove one (redundant) level from that nesting.
 >       raise err
 E       pyopencl.cffi_cl.RuntimeError: clbuildprogram failed: BUILD_PROGRAM_FAILURE - 
 E       
 E       Build on <pyopencl.Device 'Pitcairn' on 'AMD Accelerated Parallel Processing' at 0x1cbcbd0>:
 E       
 E       "/tmp/OCL32651T1.cl", line 34: error: kernel arguments can't be declared with
 E                 types
 E                 bool/half/size_t/ptrdiff_t/intptr_t/uintptr_t/pointer-to-pointer
 E         kernel void _Z11myte6kernelP16TensorEvaluator6PfP9GpuDeviceiii(global struct class_TensorEvaluator6* structs, uint structs_offset, global float* data, uint data_offset, global struct class_GpuDevice* gpudevices, uint gpudevices_offset, int a, int b, int c, local int *scratch);
 E                                                                                                                                                                                                                 ^
 E       
 E       "/tmp/OCL32651T1.cl", line 36: error: kernel arguments can't be declared with
 E                 types
 E                 bool/half/size_t/ptrdiff_t/intptr_t/uintptr_t/pointer-to-pointer
 E         kernel void _Z11myte6kernelP16TensorEvaluator6PfP9GpuDeviceiii(global struct class_TensorEvaluator6* structs, uint structs_offset, global float* data, uint data_offset, global struct class_GpuDevice* gpudevices, uint gpudevices_offset, int a, int b, int c, local int *scratch) {
 E                                                                                                                                                                                                                 ^
 E       
 E       "/tmp/OCL32651T1.cl", line 46: warning: label "v1" was declared but never
 E                 referenced
 E         v1:;
 E         ^
 E       
 E       2 errors detected in the compilation of "/tmp/OCL32651T1.cl".
 E       Frontend phase failed compilation.
 E       
 E       (options: -I /usr/local/lib/python3.5/dist-packages/pyopencl/cl)
 E       (source saved as /tmp/tmpf8iufbw0.cl)

 /usr/local/lib/python3.5/dist-packages/pyopencl/__init__.py:417: RuntimeError
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Interrupted: stopping after 1 failures !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ====================================================== 8 tests deselected by '-kpointerpointer.cu-myte6kernel' =======================================================
 =============================================================== 1 failed, 8 deselected in 6.30 seconds ===============================================================
diff --git a/testprog-device.cl b/testprog-device.cl
 struct class_HalfImpl {
    short f0;
 };
 struct class_GpuDevice {
    int f0;
    global struct class_StreamInterface* f1;
 };
 struct class_StreamInterface {
    char f0;
 };
 struct class_HalfBase {
    struct class_HalfImpl f0;
 };
 struct class_TensorEvaluator0 {
    global struct class_Half* f0;
    struct class_GpuDevice f1;
 };
 struct class_Half {
    struct class_HalfBase f0;
 };
 struct class_TensorEvaluator2 {
    global struct class_Half* f0;
    struct class_GpuDevice f1;
 };
 struct class_TensorEvaluator7 {
    global struct class_Half* f0;
    struct class_TensorEvaluator2 f1;
 };
 struct class_TensorEvaluator6 {
    struct class_TensorEvaluator0 f0;
    struct class_TensorEvaluator7 f1;
 };

 kernel void _Z11myte6kernelP16TensorEvaluator6PfP9GpuDeviceiii(global struct class_TensorEvaluator6* structs, uint structs_offset, global float* data, uint data_offset, global struct class_GpuDevice* gpudevices, uint gpudevices_offset, int a, int b, int c, local int *scratch);

 kernel void _Z11myte6kernelP16TensorEvaluator6PfP9GpuDeviceiii(global struct class_TensorEvaluator6* structs, uint structs_offset, global float* data, uint data_offset, global struct class_GpuDevice* gpudevices, uint gpudevices_offset, int a, int b, int c, local int *scratch) {
    gpudevices += gpudevices_offset;
    data += data_offset;
    structs += structs_offset;

    float v10;
    global struct class_Half* v4;
    long v2;
    short v6;

 v1:;
    /* long v2 = sext a */;
    v2 = a;
    /* struct class_Half** v3 = getelementptr structs v2 <unk> <unk> */;
    /* struct class_Half* v4 = load v3 */;
    v4 = (&(structs[v2].f0.f0))[0];
    /* short* v5 = getelementptr v4 v2 <unk> <unk> <unk> */;
    /* short v6 = load v5 */;
    v6 = (&(v4[v2].f0.f0.f0))[0];
    /* int v7 = sext v6 */;
    /* int v8 = add v7 <unk> */;
    /* float v10 = sitofp v8 */;
    v10 = (float)(v6 + 123);
    /* void v11 = store v10 data */;
    data[0] = v10;
    return;
 }
diff --git a/testprog-device.ll b/testprog-device.ll
 ; ModuleID = '/tmp/testprog-device-noopt.ll'
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"

 %struct.MyStruct = type { float, i32 }
 %class.Half = type { %class.HalfBase }
 %class.HalfBase = type { %class.HalfImpl }
 %class.HalfImpl = type { i16 }
 %class.TensorEvaluator6 = type { %class.TensorEvaluator0, %class.TensorEvaluator7 }
 %class.TensorEvaluator0 = type { %class.Half*, %class.GpuDevice }
 %class.GpuDevice = type { i32, %class.StreamInterface* }
 %class.StreamInterface = type { i8 }
 %class.TensorEvaluator7 = type { %class.Half*, %class.TensorEvaluator2 }
 %class.TensorEvaluator2 = type { %class.Half*, %class.GpuDevice }

 @.str = private unnamed_addr constant [5 x i8] c"NONE\00", align 1
 @llvm.used = appending global [1 x i8*] [i8* bitcast (i32 ()* @_ZL21__nvvm_reflect_anchorv to i8*)], section "llvm.metadata"

 ; Function Attrs: nounwind readnone
 define internal i32 @_ZL21__nvvm_reflect_anchorv() #0 {
  %1 = tail call i32 @__nvvm_reflect(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0)) #3
  ret i32 %1
 }

 ; Function Attrs: nounwind readnone
 declare i32 @__nvvm_reflect(i8*) #0

 ; Function Attrs: norecurse nounwind readonly
 define float @_Z9sumStructPP8MyStructi(%struct.MyStruct** nocapture readonly %p_structs, i32 %N) #1 {
  %1 = icmp sgt i32 %N, 0
  br i1 %1, label %.lr.ph.preheader, label %._crit_edge

 .lr.ph.preheader:                                 ; preds = %0
  %xtraiter = and i32 %N, 1
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %.lr.ph.preheader.split, label %.lr.ph.prol

 .lr.ph.prol:                                      ; preds = %.lr.ph.preheader
  %2 = load %struct.MyStruct*, %struct.MyStruct** %p_structs, align 8, !tbaa !9
  %3 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %2, i64 0, i32 0
  %4 = load float, float* %3, align 4, !tbaa !13
  %5 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %2, i64 0, i32 1
  %6 = load i32, i32* %5, align 4, !tbaa !17
  %7 = sitofp i32 %6 to float
  %8 = fmul float %7, 3.500000e+00
  %9 = fadd float %4, %8
  %10 = fadd float %9, 0.000000e+00
  br label %.lr.ph.preheader.split

 .lr.ph.preheader.split:                           ; preds = %.lr.ph.prol, %.lr.ph.preheader
  %.lcssa.unr = phi float [ undef, %.lr.ph.preheader ], [ %10, %.lr.ph.prol ]
  %sum.02.unr = phi float [ 0.000000e+00, %.lr.ph.preheader ], [ %10, %.lr.ph.prol ]
  %i.01.unr = phi i32 [ 0, %.lr.ph.preheader ], [ 1, %.lr.ph.prol ]
  %11 = icmp eq i32 %N, 1
  br i1 %11, label %._crit_edge.loopexit, label %.lr.ph.preheader.split.split

 .lr.ph.preheader.split.split:                     ; preds = %.lr.ph.preheader.split
  br label %.lr.ph

 ._crit_edge.loopexit.unr-lcssa:                   ; preds = %.lr.ph
  %.lcssa3 = phi float [ %34, %.lr.ph ]
  br label %._crit_edge.loopexit

 ._crit_edge.loopexit:                             ; preds = %._crit_edge.loopexit.unr-lcssa, %.lr.ph.preheader.split
  %.lcssa = phi float [ %.lcssa.unr, %.lr.ph.preheader.split ], [ %.lcssa3, %._crit_edge.loopexit.unr-lcssa ]
  br label %._crit_edge

 ._crit_edge:                                      ; preds = %._crit_edge.loopexit, %0
  %sum.0.lcssa = phi float [ 0.000000e+00, %0 ], [ %.lcssa, %._crit_edge.loopexit ]
  ret float %sum.0.lcssa

 .lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader.split.split
  %sum.02 = phi float [ %sum.02.unr, %.lr.ph.preheader.split.split ], [ %34, %.lr.ph ]
  %i.01 = phi i32 [ %i.01.unr, %.lr.ph.preheader.split.split ], [ %35, %.lr.ph ]
  %12 = sext i32 %i.01 to i64
  %13 = getelementptr inbounds %struct.MyStruct*, %struct.MyStruct** %p_structs, i64 %12
  %14 = load %struct.MyStruct*, %struct.MyStruct** %13, align 8, !tbaa !9
  %15 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %14, i64 0, i32 0
  %16 = load float, float* %15, align 4, !tbaa !13
  %17 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %14, i64 0, i32 1
  %18 = load i32, i32* %17, align 4, !tbaa !17
  %19 = sitofp i32 %18 to float
  %20 = fmul float %19, 3.500000e+00
  %21 = fadd float %16, %20
  %22 = fadd float %sum.02, %21
  %23 = add nuw nsw i32 %i.01, 1
  %24 = sext i32 %23 to i64
  %25 = getelementptr inbounds %struct.MyStruct*, %struct.MyStruct** %p_structs, i64 %24
  %26 = load %struct.MyStruct*, %struct.MyStruct** %25, align 8, !tbaa !9
  %27 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %26, i64 0, i32 0
  %28 = load float, float* %27, align 4, !tbaa !13
  %29 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %26, i64 0, i32 1
  %30 = load i32, i32* %29, align 4, !tbaa !17
  %31 = sitofp i32 %30 to float
  %32 = fmul float %31, 3.500000e+00
  %33 = fadd float %28, %32
  %34 = fadd float %22, %33
  %35 = add nsw i32 %i.01, 2
  %exitcond.1 = icmp eq i32 %35, %N
  br i1 %exitcond.1, label %._crit_edge.loopexit.unr-lcssa, label %.lr.ph
 }

 ; Function Attrs: norecurse nounwind
 define void @_Z8mykernelPfP8MyStructi(float* nocapture %data, %struct.MyStruct* %structs, i32 %N) #2 {
  %1 = icmp sgt i32 %N, 0
  %2 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %structs, i64 0, i32 0
  br i1 %1, label %.lr.ph.i.preheader, label %._Z9sumStructPP8MyStructi.exit_crit_edge

 ._Z9sumStructPP8MyStructi.exit_crit_edge:         ; preds = %0
  %.pre17 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %structs, i64 0, i32 1
  br label %_Z9sumStructPP8MyStructi.exit

 .lr.ph.i.preheader:                               ; preds = %0
  %3 = load float, float* %2, align 4
  %4 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %structs, i64 0, i32 1
  %5 = load i32, i32* %4, align 4
  %6 = sitofp i32 %5 to float
  %7 = fmul float %6, 3.500000e+00
  %8 = fadd float %3, %7
  %9 = add i32 %N, -1
  %xtraiter = and i32 %N, 7
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %.lr.ph.i.preheader.split, label %.lr.ph.i.prol.preheader

 .lr.ph.i.prol.preheader:                          ; preds = %.lr.ph.i.preheader
  br label %.lr.ph.i.prol

 .lr.ph.i.prol:                                    ; preds = %.lr.ph.i.prol, %.lr.ph.i.prol.preheader
  %sum.02.i.prol = phi float [ %10, %.lr.ph.i.prol ], [ 0.000000e+00, %.lr.ph.i.prol.preheader ]
  %i.01.i.prol = phi i32 [ %11, %.lr.ph.i.prol ], [ 0, %.lr.ph.i.prol.preheader ]
  %prol.iter = phi i32 [ %prol.iter.sub, %.lr.ph.i.prol ], [ %xtraiter, %.lr.ph.i.prol.preheader ]
  %10 = fadd float %sum.02.i.prol, %8
  %11 = add nuw nsw i32 %i.01.i.prol, 1
  %prol.iter.sub = add i32 %prol.iter, -1
  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
  br i1 %prol.iter.cmp, label %.lr.ph.i.preheader.split.loopexit, label %.lr.ph.i.prol, !llvm.loop !18

 .lr.ph.i.preheader.split.loopexit:                ; preds = %.lr.ph.i.prol
  %.lcssa28 = phi i32 [ %11, %.lr.ph.i.prol ]
  %.lcssa27 = phi float [ %10, %.lr.ph.i.prol ]
  br label %.lr.ph.i.preheader.split

 .lr.ph.i.preheader.split:                         ; preds = %.lr.ph.i.preheader.split.loopexit, %.lr.ph.i.preheader
  %.lcssa24.unr = phi float [ undef, %.lr.ph.i.preheader ], [ %.lcssa27, %.lr.ph.i.preheader.split.loopexit ]
  %sum.02.i.unr = phi float [ 0.000000e+00, %.lr.ph.i.preheader ], [ %.lcssa27, %.lr.ph.i.preheader.split.loopexit ]
  %i.01.i.unr = phi i32 [ 0, %.lr.ph.i.preheader ], [ %.lcssa28, %.lr.ph.i.preheader.split.loopexit ]
  %12 = icmp ult i32 %9, 7
  br i1 %12, label %_Z9sumStructPP8MyStructi.exit.loopexit, label %.lr.ph.i.preheader.split.split

 .lr.ph.i.preheader.split.split:                   ; preds = %.lr.ph.i.preheader.split
  br label %.lr.ph.i

 .lr.ph.i:                                         ; preds = %.lr.ph.i, %.lr.ph.i.preheader.split.split
  %sum.02.i = phi float [ %sum.02.i.unr, %.lr.ph.i.preheader.split.split ], [ %20, %.lr.ph.i ]
  %i.01.i = phi i32 [ %i.01.i.unr, %.lr.ph.i.preheader.split.split ], [ %21, %.lr.ph.i ]
  %13 = fadd float %sum.02.i, %8
  %14 = fadd float %13, %8
  %15 = fadd float %14, %8
  %16 = fadd float %15, %8
  %17 = fadd float %16, %8
  %18 = fadd float %17, %8
  %19 = fadd float %18, %8
  %20 = fadd float %19, %8
  %21 = add nsw i32 %i.01.i, 8
  %exitcond.i.7 = icmp eq i32 %21, %N
  br i1 %exitcond.i.7, label %_Z9sumStructPP8MyStructi.exit.loopexit.unr-lcssa, label %.lr.ph.i

 _Z9sumStructPP8MyStructi.exit.loopexit.unr-lcssa: ; preds = %.lr.ph.i
  %.lcssa26 = phi float [ %20, %.lr.ph.i ]
  br label %_Z9sumStructPP8MyStructi.exit.loopexit

 _Z9sumStructPP8MyStructi.exit.loopexit:           ; preds = %_Z9sumStructPP8MyStructi.exit.loopexit.unr-lcssa, %.lr.ph.i.preheader.split
  %.lcssa24 = phi float [ %.lcssa24.unr, %.lr.ph.i.preheader.split ], [ %.lcssa26, %_Z9sumStructPP8MyStructi.exit.loopexit.unr-lcssa ]
  br label %_Z9sumStructPP8MyStructi.exit

 _Z9sumStructPP8MyStructi.exit:                    ; preds = %_Z9sumStructPP8MyStructi.exit.loopexit, %._Z9sumStructPP8MyStructi.exit_crit_edge
  %.pre-phi18 = phi i32* [ %.pre17, %._Z9sumStructPP8MyStructi.exit_crit_edge ], [ %4, %_Z9sumStructPP8MyStructi.exit.loopexit ]
  %sum.0.lcssa.i = phi float [ 0.000000e+00, %._Z9sumStructPP8MyStructi.exit_crit_edge ], [ %.lcssa24, %_Z9sumStructPP8MyStructi.exit.loopexit ]
  store float %sum.0.lcssa.i, float* %data, align 4, !tbaa !20
  %22 = load float, float* %2, align 4
  %23 = load i32, i32* %.pre-phi18, align 4
  %24 = sitofp i32 %23 to float
  %25 = fmul float %24, 3.500000e+00
  %26 = fadd float %22, %25
  br label %.lr.ph.i11

 .lr.ph.i11:                                       ; preds = %.lr.ph.i11, %_Z9sumStructPP8MyStructi.exit
  %sum.02.i8 = phi float [ 0.000000e+00, %_Z9sumStructPP8MyStructi.exit ], [ %29, %.lr.ph.i11 ]
  %i.01.i9 = phi i32 [ 0, %_Z9sumStructPP8MyStructi.exit ], [ %30, %.lr.ph.i11 ]
  %27 = fadd float %sum.02.i8, %26
  %28 = fadd float %27, %26
  %29 = fadd float %28, %26
  %30 = add nsw i32 %i.01.i9, 3
  %exitcond.i10.2 = icmp eq i32 %30, 123
  br i1 %exitcond.i10.2, label %_Z9sumStructPP8MyStructi.exit12, label %.lr.ph.i11

 _Z9sumStructPP8MyStructi.exit12:                  ; preds = %.lr.ph.i11
  %.lcssa25 = phi float [ %29, %.lr.ph.i11 ]
  %31 = getelementptr inbounds float, float* %data, i64 3
  store float %.lcssa25, float* %31, align 4, !tbaa !20
  %32 = load float, float* %2, align 4
  %33 = load i32, i32* %.pre-phi18, align 4
  %34 = sitofp i32 %33 to float
  %35 = fmul float %34, 3.500000e+00
  %36 = fadd float %32, %35
  br label %.lr.ph.i5

 .lr.ph.i5:                                        ; preds = %.lr.ph.i5, %_Z9sumStructPP8MyStructi.exit12
  %sum.02.i2 = phi float [ 0.000000e+00, %_Z9sumStructPP8MyStructi.exit12 ], [ %51, %.lr.ph.i5 ]
  %i.01.i3 = phi i32 [ 0, %_Z9sumStructPP8MyStructi.exit12 ], [ %52, %.lr.ph.i5 ]
  %37 = fadd float %sum.02.i2, %36
  %38 = fadd float %37, %36
  %39 = fadd float %38, %36
  %40 = fadd float %39, %36
  %41 = fadd float %40, %36
  %42 = fadd float %41, %36
  %43 = fadd float %42, %36
  %44 = fadd float %43, %36
  %45 = fadd float %44, %36
  %46 = fadd float %45, %36
  %47 = fadd float %46, %36
  %48 = fadd float %47, %36
  %49 = fadd float %48, %36
  %50 = fadd float %49, %36
  %51 = fadd float %50, %36
  %52 = add nsw i32 %i.01.i3, 15
  %exitcond.i4.14 = icmp eq i32 %52, 12300
  br i1 %exitcond.i4.14, label %_Z9sumStructPP8MyStructi.exit6, label %.lr.ph.i5

 _Z9sumStructPP8MyStructi.exit6:                   ; preds = %.lr.ph.i5
  %.lcssa = phi float [ %51, %.lr.ph.i5 ]
  %53 = getelementptr inbounds float, float* %data, i64 4
  store float %.lcssa, float* %53, align 4, !tbaa !20
  ret void
 }

 ; Function Attrs: norecurse nounwind readonly
 define float @_Z12getHalfValueP4Halfi(%class.Half* nocapture readonly %half_, i32 %a) #1 {
  %1 = sext i32 %a to i64
  %2 = getelementptr inbounds %class.Half, %class.Half* %half_, i64 %1, i32 0, i32 0, i32 0
  %3 = load i16, i16* %2, align 2, !tbaa !21
  %4 = sext i16 %3 to i32
  %5 = add nsw i32 %4, 123
  %6 = sitofp i32 %5 to float
  ret float %6
 }

 ; Function Attrs: norecurse nounwind
 define void @_Z11myte6kernelP16TensorEvaluator6PfP9GpuDeviceiii(%class.TensorEvaluator6* nocapture readonly %structs, float* nocapture %data, %class.GpuDevice* nocapture readnone %gpudevices, i32 %a, i32 %b, i32 %c) #2 {
  %1 = sext i32 %a to i64
  %2 = getelementptr inbounds %class.TensorEvaluator6, %class.TensorEvaluator6* %structs, i64 %1, i32 0, i32 0
  %3 = load %class.Half*, %class.Half** %2, align 8, !tbaa !24
  %4 = getelementptr inbounds %class.Half, %class.Half* %3, i64 %1, i32 0, i32 0, i32 0
  %5 = load i16, i16* %4, align 2, !tbaa !21
  %6 = sext i16 %5 to i32
  %7 = add nsw i32 %6, 123
  %8 = sitofp i32 %7 to float
  store float %8, float* %data, align 4, !tbaa !20
  ret void
 }

 attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_30" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { norecurse nounwind readonly "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_30" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_30" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #3 = { nounwind readnone }

 !nvvm.annotations = !{!0, !1, !2, !3, !2, !4, !4, !4, !4, !5, !5, !4}
 !llvm.module.flags = !{!6}
 !llvm.ident = !{!7}
 !nvvm.internalize.after.link = !{}
 !nvvmir.version = !{!8}

 !0 = !{void (float*, %struct.MyStruct*, i32)* @_Z8mykernelPfP8MyStructi, !"kernel", i32 1}
 !1 = !{void (%class.TensorEvaluator6*, float*, %class.GpuDevice*, i32, i32, i32)* @_Z11myte6kernelP16TensorEvaluator6PfP9GpuDeviceiii, !"kernel", i32 1}
 !2 = !{null, !"align", i32 8}
 !3 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
 !4 = !{null, !"align", i32 16}
 !5 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !6 = !{i32 1, !"PIC Level", i32 2}
 !7 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}
 !8 = !{i32 1, i32 2}
 !9 = !{!10, !10, i64 0}
 !10 = !{!"any pointer", !11, i64 0}
 !11 = !{!"omnipotent char", !12, i64 0}
 !12 = !{!"Simple C/C++ TBAA"}
 !13 = !{!14, !15, i64 0}
 !14 = !{!"_ZTS8MyStruct", !15, i64 0, !16, i64 4}
 !15 = !{!"float", !11, i64 0}
 !16 = !{!"int", !11, i64 0}
 !17 = !{!14, !16, i64 4}
 !18 = distinct !{!18, !19}
 !19 = !{!"llvm.loop.unroll.disable"}
 !20 = !{!15, !15, i64 0}
 !21 = !{!22, !23, i64 0}
 !22 = !{!"_ZTS8HalfImpl", !23, i64 0}
 !23 = !{!"short", !11, i64 0}
 !24 = !{!25, !10, i64 0}
 !25 = !{!"_ZTS16TensorEvaluator6", !26, i64 0, !28, i64 24}
 !26 = !{!"_ZTS16TensorEvaluator0", !10, i64 0, !27, i64 8}
 !27 = !{!"_ZTS9GpuDevice", !16, i64 0, !10, i64 8}
 !28 = !{!"_ZTS16TensorEvaluator7", !10, i64 0, !29, i64 8}
 !29 = !{!"_ZTS16TensorEvaluator2", !10, i64 0, !27, i64 8}
	$ OFFSET32_BIT=1 CLANG_HOME=/usr/lib/llvm-3.8 py.test -svx test/test_compile.py -k pointerpointer.cu-myte6kernel
	======================================================================== test session starts =========================================================================
	platform linux -- Python 3.5.2, pytest-2.9.1, py-1.4.31, pluggy-0.3.1 -- /usr/bin/python3
	cachedir: .cache
	rootdir: /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl, inifile: pytest.ini
	collecting 0 itemsmarking xfail
	marking xfail
	marking xfail
	marking xfail
	collected 9 items

	test/test_compile.py::test_compile[test/pointerpointer.cu-myte6kernel] context <pyopencl.Context at 0x1c2fc60 on <pyopencl.Device 'Pitcairn' on 'AMD Accelerated Parallel Processing' at 0x1cbcbd0>>
	options []
	bin/cocl -c /tmp/testprog.cu
	warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
	1 warning generated.
	warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
	warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
	2 warnings generated.
	clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
	clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
	warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
	1 warning generated.

	build/ir-to-opencl --inputfile /tmp/testprog-device.ll --outputfile /tmp/testprog-device.cl --kernelname --add_ir_to_cl
	terminate called after throwing an instance of 'std::runtime_error'
	what(): Couldnt find kernel

	mangledname _Z11myte6kernelP16TensorEvaluator6PfP9GpuDeviceiii
	options []
	bin/cocl -c /tmp/testprog.cu
	warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
	1 warning generated.
	warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
	warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
	2 warnings generated.
	clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
	clang: warning: argument unused during compilation: '-I /usr/lib/llvm-3.8/include'
	warning: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Wunknown-warning-option]
	1 warning generated.

	build/ir-to-opencl --inputfile /tmp/testprog-device.ll --outputfile /tmp/testprog-device.cl --kernelname _Z11myte6kernelP16TensorEvaluator6PfP9GpuDeviceiii --add_ir_to_cl

	FAILED
	====================================================================== short test summary info =======================================================================
	FAIL test/test_compile.py::test_compile[test/pointerpointer.cu-myte6kernel]

	============================================================================== FAILURES ==============================================================================
	__________________________________________________________ test_compile[test/pointerpointer.cu-myte6kernel] __________________________________________________________

	context = <pyopencl.Context at 0x1c2fc60 on <pyopencl.Device 'Pitcairn' on 'AMD Accelerated Parallel Processing' at 0x1cbcbd0>>
	cu_filepath = 'test/pointerpointer.cu', kernelname = 'myte6kernel'

	@pytest.mark.parametrize("cu_filepath,kernelname", get_test_definitions())
	def test_compile(context, cu_filepath, kernelname):
	with open(cu_filepath, 'r') as f:
	cu_code = f.read()

	try:
	cl_code = test_common.cu_to_cl(cu_code, '')
	except:
	pass

	with open('/tmp/testprog-device.ll') as f:
	ll_code = f.read()

	for line in ll_code.split('\n'):
	if line.startswith('define') and kernelname in line:
	mangledname = line.split('@')[1].split('(')[0]
	break

	print('mangledname', mangledname)

	cl_code = test_common.cu_to_cl(cu_code, mangledname)

	> test_common.build_kernel(context, cl_code, mangledname)

	test/test_compile.py:47:
	_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
	test/test_common.py:232: in build_kernel
	prog = cl.Program(context, cl_sourcecode).build()
	/usr/local/lib/python3.5/dist-packages/pyopencl/__init__.py:382: in build
	options=options, source=self._source)
	_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

	self = <pyopencl.Program object at 0x7f30a1f62c88>, build_func = <function Program.build.<locals>.<lambda> at 0x7f30b019fe18>
	options = ['-I', '/usr/local/lib/python3.5/dist-packages/pyopencl/cl']
	source = 'struct class_HalfImpl {\n short f0;\n};\nstruct class_GpuDevice {\n int f0;\n global struct class_StreamInte...itofp v8 /;\n v10 = (float)(v6 + 123);\n / void v11 = store v10 data */;\n data[0] = v10;\n return;\n}\n'

	def _build_and_catch_errors(self, build_func, options, source=None):
	try:
	return build_func()
	except _cl.RuntimeError as e:
	what = e.what
	if options:
	what = what + "\n(options: %s)" % " ".join(options)

	if source is not None:
	from tempfile import NamedTemporaryFile
	srcfile = NamedTemporaryFile(mode="wt", delete=False, suffix=".cl")
	try:
	srcfile.write(source)
	finally:
	srcfile.close()

	what = what + "\n(source saved as %s)" % srcfile.name

	code = e.code
	routine = e.routine

	err = _cl.RuntimeError(
	_ErrorRecord(
	what=lambda: what,
	code=lambda: code,
	routine=lambda: routine))

	# Python 3.2 outputs the whole list of currently active exceptions
	# This serves to remove one (redundant) level from that nesting.
	> raise err
	E pyopencl.cffi_cl.RuntimeError: clbuildprogram failed: BUILD_PROGRAM_FAILURE -
	E
	E Build on <pyopencl.Device 'Pitcairn' on 'AMD Accelerated Parallel Processing' at 0x1cbcbd0>:
	E
	E "/tmp/OCL32651T1.cl", line 34: error: kernel arguments can't be declared with
	E types
	E bool/half/size_t/ptrdiff_t/intptr_t/uintptr_t/pointer-to-pointer
	E kernel void _Z11myte6kernelP16TensorEvaluator6PfP9GpuDeviceiii(global struct class_TensorEvaluator6* structs, uint structs_offset, global float* data, uint data_offset, global struct class_GpuDevice* gpudevices, uint gpudevices_offset, int a, int b, int c, local int *scratch);
	E ^
	E
	E "/tmp/OCL32651T1.cl", line 36: error: kernel arguments can't be declared with
	E types
	E bool/half/size_t/ptrdiff_t/intptr_t/uintptr_t/pointer-to-pointer
	E kernel void _Z11myte6kernelP16TensorEvaluator6PfP9GpuDeviceiii(global struct class_TensorEvaluator6* structs, uint structs_offset, global float* data, uint data_offset, global struct class_GpuDevice* gpudevices, uint gpudevices_offset, int a, int b, int c, local int *scratch) {
	E ^
	E
	E "/tmp/OCL32651T1.cl", line 46: warning: label "v1" was declared but never
	E referenced
	E v1:;
	E ^
	E
	E 2 errors detected in the compilation of "/tmp/OCL32651T1.cl".
	E Frontend phase failed compilation.
	E
	E (options: -I /usr/local/lib/python3.5/dist-packages/pyopencl/cl)
	E (source saved as /tmp/tmpf8iufbw0.cl)

	/usr/local/lib/python3.5/dist-packages/pyopencl/__init__.py:417: RuntimeError
	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Interrupted: stopping after 1 failures !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
	====================================================== 8 tests deselected by '-kpointerpointer.cu-myte6kernel' =======================================================
	=============================================================== 1 failed, 8 deselected in 6.30 seconds ===============================================================
	struct class_HalfImpl {
	short f0;
	};
	struct class_GpuDevice {
	int f0;
	global struct class_StreamInterface* f1;
	};
	struct class_StreamInterface {
	char f0;
	};
	struct class_HalfBase {
	struct class_HalfImpl f0;
	};
	struct class_TensorEvaluator0 {
	global struct class_Half* f0;
	struct class_GpuDevice f1;
	};
	struct class_Half {
	struct class_HalfBase f0;
	};
	struct class_TensorEvaluator2 {
	global struct class_Half* f0;
	struct class_GpuDevice f1;
	};
	struct class_TensorEvaluator7 {
	global struct class_Half* f0;
	struct class_TensorEvaluator2 f1;
	};
	struct class_TensorEvaluator6 {
	struct class_TensorEvaluator0 f0;
	struct class_TensorEvaluator7 f1;
	};

	kernel void _Z11myte6kernelP16TensorEvaluator6PfP9GpuDeviceiii(global struct class_TensorEvaluator6* structs, uint structs_offset, global float* data, uint data_offset, global struct class_GpuDevice* gpudevices, uint gpudevices_offset, int a, int b, int c, local int *scratch);

	kernel void _Z11myte6kernelP16TensorEvaluator6PfP9GpuDeviceiii(global struct class_TensorEvaluator6* structs, uint structs_offset, global float* data, uint data_offset, global struct class_GpuDevice* gpudevices, uint gpudevices_offset, int a, int b, int c, local int *scratch) {
	gpudevices += gpudevices_offset;
	data += data_offset;
	structs += structs_offset;

	float v10;
	global struct class_Half* v4;
	long v2;
	short v6;

	v1:;
	/* long v2 = sext a */;
	v2 = a;
	/* struct class_Half** v3 = getelementptr structs v2 <unk> <unk> */;
	/* struct class_Half* v4 = load v3 */;
	v4 = (&(structs[v2].f0.f0))[0];
	/* short* v5 = getelementptr v4 v2 <unk> <unk> <unk> */;
	/* short v6 = load v5 */;
	v6 = (&(v4[v2].f0.f0.f0))[0];
	/* int v7 = sext v6 */;
	/* int v8 = add v7 <unk> */;
	/* float v10 = sitofp v8 */;
	v10 = (float)(v6 + 123);
	/* void v11 = store v10 data */;
	data[0] = v10;
	return;
	}
	; ModuleID = '/tmp/testprog-device-noopt.ll'
	target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
	target triple = "nvptx64-nvidia-cuda"

	%struct.MyStruct = type { float, i32 }
	%class.Half = type { %class.HalfBase }
	%class.HalfBase = type { %class.HalfImpl }
	%class.HalfImpl = type { i16 }
	%class.TensorEvaluator6 = type { %class.TensorEvaluator0, %class.TensorEvaluator7 }
	%class.TensorEvaluator0 = type { %class.Half*, %class.GpuDevice }
	%class.GpuDevice = type { i32, %class.StreamInterface* }
	%class.StreamInterface = type { i8 }
	%class.TensorEvaluator7 = type { %class.Half*, %class.TensorEvaluator2 }
	%class.TensorEvaluator2 = type { %class.Half*, %class.GpuDevice }

	@.str = private unnamed_addr constant [5 x i8] c"NONE\00", align 1
	@llvm.used = appending global [1 x i8] [i8 bitcast (i32 ()* @_ZL21__nvvm_reflect_anchorv to i8*)], section "llvm.metadata"

	; Function Attrs: nounwind readnone
	define internal i32 @_ZL21__nvvm_reflect_anchorv() #0 {
	%1 = tail call i32 @__nvvm_reflect(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0)) #3
	ret i32 %1
	}

	; Function Attrs: nounwind readnone
	declare i32 @__nvvm_reflect(i8*) #0

	; Function Attrs: norecurse nounwind readonly
	define float @_Z9sumStructPP8MyStructi(%struct.MyStruct** nocapture readonly %p_structs, i32 %N) #1 {
	%1 = icmp sgt i32 %N, 0
	br i1 %1, label %.lr.ph.preheader, label %._crit_edge

	.lr.ph.preheader: ; preds = %0
	%xtraiter = and i32 %N, 1
	%lcmp.mod = icmp eq i32 %xtraiter, 0
	br i1 %lcmp.mod, label %.lr.ph.preheader.split, label %.lr.ph.prol

	.lr.ph.prol: ; preds = %.lr.ph.preheader
	%2 = load %struct.MyStruct*, %struct.MyStruct** %p_structs, align 8, !tbaa !9
	%3 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %2, i64 0, i32 0
	%4 = load float, float* %3, align 4, !tbaa !13
	%5 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %2, i64 0, i32 1
	%6 = load i32, i32* %5, align 4, !tbaa !17
	%7 = sitofp i32 %6 to float
	%8 = fmul float %7, 3.500000e+00
	%9 = fadd float %4, %8
	%10 = fadd float %9, 0.000000e+00
	br label %.lr.ph.preheader.split

	.lr.ph.preheader.split: ; preds = %.lr.ph.prol, %.lr.ph.preheader
	%.lcssa.unr = phi float [ undef, %.lr.ph.preheader ], [ %10, %.lr.ph.prol ]
	%sum.02.unr = phi float [ 0.000000e+00, %.lr.ph.preheader ], [ %10, %.lr.ph.prol ]
	%i.01.unr = phi i32 [ 0, %.lr.ph.preheader ], [ 1, %.lr.ph.prol ]
	%11 = icmp eq i32 %N, 1
	br i1 %11, label %._crit_edge.loopexit, label %.lr.ph.preheader.split.split

	.lr.ph.preheader.split.split: ; preds = %.lr.ph.preheader.split
	br label %.lr.ph

	._crit_edge.loopexit.unr-lcssa: ; preds = %.lr.ph
	%.lcssa3 = phi float [ %34, %.lr.ph ]
	br label %._crit_edge.loopexit

	._crit_edge.loopexit: ; preds = %._crit_edge.loopexit.unr-lcssa, %.lr.ph.preheader.split
	%.lcssa = phi float [ %.lcssa.unr, %.lr.ph.preheader.split ], [ %.lcssa3, %._crit_edge.loopexit.unr-lcssa ]
	br label %._crit_edge

	._crit_edge: ; preds = %._crit_edge.loopexit, %0
	%sum.0.lcssa = phi float [ 0.000000e+00, %0 ], [ %.lcssa, %._crit_edge.loopexit ]
	ret float %sum.0.lcssa

	.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader.split.split
	%sum.02 = phi float [ %sum.02.unr, %.lr.ph.preheader.split.split ], [ %34, %.lr.ph ]
	%i.01 = phi i32 [ %i.01.unr, %.lr.ph.preheader.split.split ], [ %35, %.lr.ph ]
	%12 = sext i32 %i.01 to i64
	%13 = getelementptr inbounds %struct.MyStruct*, %struct.MyStruct** %p_structs, i64 %12
	%14 = load %struct.MyStruct*, %struct.MyStruct** %13, align 8, !tbaa !9
	%15 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %14, i64 0, i32 0
	%16 = load float, float* %15, align 4, !tbaa !13
	%17 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %14, i64 0, i32 1
	%18 = load i32, i32* %17, align 4, !tbaa !17
	%19 = sitofp i32 %18 to float
	%20 = fmul float %19, 3.500000e+00
	%21 = fadd float %16, %20
	%22 = fadd float %sum.02, %21
	%23 = add nuw nsw i32 %i.01, 1
	%24 = sext i32 %23 to i64
	%25 = getelementptr inbounds %struct.MyStruct*, %struct.MyStruct** %p_structs, i64 %24
	%26 = load %struct.MyStruct*, %struct.MyStruct** %25, align 8, !tbaa !9
	%27 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %26, i64 0, i32 0
	%28 = load float, float* %27, align 4, !tbaa !13
	%29 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %26, i64 0, i32 1
	%30 = load i32, i32* %29, align 4, !tbaa !17
	%31 = sitofp i32 %30 to float
	%32 = fmul float %31, 3.500000e+00
	%33 = fadd float %28, %32
	%34 = fadd float %22, %33
	%35 = add nsw i32 %i.01, 2
	%exitcond.1 = icmp eq i32 %35, %N
	br i1 %exitcond.1, label %._crit_edge.loopexit.unr-lcssa, label %.lr.ph
	}

	; Function Attrs: norecurse nounwind
	define void @_Z8mykernelPfP8MyStructi(float* nocapture %data, %struct.MyStruct* %structs, i32 %N) #2 {
	%1 = icmp sgt i32 %N, 0
	%2 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %structs, i64 0, i32 0
	br i1 %1, label %.lr.ph.i.preheader, label %._Z9sumStructPP8MyStructi.exit_crit_edge

	._Z9sumStructPP8MyStructi.exit_crit_edge: ; preds = %0
	%.pre17 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %structs, i64 0, i32 1
	br label %_Z9sumStructPP8MyStructi.exit

	.lr.ph.i.preheader: ; preds = %0
	%3 = load float, float* %2, align 4
	%4 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %structs, i64 0, i32 1
	%5 = load i32, i32* %4, align 4
	%6 = sitofp i32 %5 to float
	%7 = fmul float %6, 3.500000e+00
	%8 = fadd float %3, %7
	%9 = add i32 %N, -1
	%xtraiter = and i32 %N, 7
	%lcmp.mod = icmp eq i32 %xtraiter, 0
	br i1 %lcmp.mod, label %.lr.ph.i.preheader.split, label %.lr.ph.i.prol.preheader

	.lr.ph.i.prol.preheader: ; preds = %.lr.ph.i.preheader
	br label %.lr.ph.i.prol

	.lr.ph.i.prol: ; preds = %.lr.ph.i.prol, %.lr.ph.i.prol.preheader
	%sum.02.i.prol = phi float [ %10, %.lr.ph.i.prol ], [ 0.000000e+00, %.lr.ph.i.prol.preheader ]
	%i.01.i.prol = phi i32 [ %11, %.lr.ph.i.prol ], [ 0, %.lr.ph.i.prol.preheader ]
	%prol.iter = phi i32 [ %prol.iter.sub, %.lr.ph.i.prol ], [ %xtraiter, %.lr.ph.i.prol.preheader ]
	%10 = fadd float %sum.02.i.prol, %8
	%11 = add nuw nsw i32 %i.01.i.prol, 1
	%prol.iter.sub = add i32 %prol.iter, -1
	%prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
	br i1 %prol.iter.cmp, label %.lr.ph.i.preheader.split.loopexit, label %.lr.ph.i.prol, !llvm.loop !18

	.lr.ph.i.preheader.split.loopexit: ; preds = %.lr.ph.i.prol
	%.lcssa28 = phi i32 [ %11, %.lr.ph.i.prol ]
	%.lcssa27 = phi float [ %10, %.lr.ph.i.prol ]
	br label %.lr.ph.i.preheader.split

	.lr.ph.i.preheader.split: ; preds = %.lr.ph.i.preheader.split.loopexit, %.lr.ph.i.preheader
	%.lcssa24.unr = phi float [ undef, %.lr.ph.i.preheader ], [ %.lcssa27, %.lr.ph.i.preheader.split.loopexit ]
	%sum.02.i.unr = phi float [ 0.000000e+00, %.lr.ph.i.preheader ], [ %.lcssa27, %.lr.ph.i.preheader.split.loopexit ]
	%i.01.i.unr = phi i32 [ 0, %.lr.ph.i.preheader ], [ %.lcssa28, %.lr.ph.i.preheader.split.loopexit ]
	%12 = icmp ult i32 %9, 7
	br i1 %12, label %_Z9sumStructPP8MyStructi.exit.loopexit, label %.lr.ph.i.preheader.split.split

	.lr.ph.i.preheader.split.split: ; preds = %.lr.ph.i.preheader.split
	br label %.lr.ph.i

	.lr.ph.i: ; preds = %.lr.ph.i, %.lr.ph.i.preheader.split.split
	%sum.02.i = phi float [ %sum.02.i.unr, %.lr.ph.i.preheader.split.split ], [ %20, %.lr.ph.i ]
	%i.01.i = phi i32 [ %i.01.i.unr, %.lr.ph.i.preheader.split.split ], [ %21, %.lr.ph.i ]
	%13 = fadd float %sum.02.i, %8
	%14 = fadd float %13, %8
	%15 = fadd float %14, %8
	%16 = fadd float %15, %8
	%17 = fadd float %16, %8
	%18 = fadd float %17, %8
	%19 = fadd float %18, %8
	%20 = fadd float %19, %8
	%21 = add nsw i32 %i.01.i, 8
	%exitcond.i.7 = icmp eq i32 %21, %N
	br i1 %exitcond.i.7, label %_Z9sumStructPP8MyStructi.exit.loopexit.unr-lcssa, label %.lr.ph.i

	_Z9sumStructPP8MyStructi.exit.loopexit.unr-lcssa: ; preds = %.lr.ph.i
	%.lcssa26 = phi float [ %20, %.lr.ph.i ]
	br label %_Z9sumStructPP8MyStructi.exit.loopexit

	_Z9sumStructPP8MyStructi.exit.loopexit: ; preds = %_Z9sumStructPP8MyStructi.exit.loopexit.unr-lcssa, %.lr.ph.i.preheader.split
	%.lcssa24 = phi float [ %.lcssa24.unr, %.lr.ph.i.preheader.split ], [ %.lcssa26, %_Z9sumStructPP8MyStructi.exit.loopexit.unr-lcssa ]
	br label %_Z9sumStructPP8MyStructi.exit

	_Z9sumStructPP8MyStructi.exit: ; preds = %_Z9sumStructPP8MyStructi.exit.loopexit, %._Z9sumStructPP8MyStructi.exit_crit_edge
	%.pre-phi18 = phi i32* [ %.pre17, %._Z9sumStructPP8MyStructi.exit_crit_edge ], [ %4, %_Z9sumStructPP8MyStructi.exit.loopexit ]
	%sum.0.lcssa.i = phi float [ 0.000000e+00, %._Z9sumStructPP8MyStructi.exit_crit_edge ], [ %.lcssa24, %_Z9sumStructPP8MyStructi.exit.loopexit ]
	store float %sum.0.lcssa.i, float* %data, align 4, !tbaa !20
	%22 = load float, float* %2, align 4
	%23 = load i32, i32* %.pre-phi18, align 4
	%24 = sitofp i32 %23 to float
	%25 = fmul float %24, 3.500000e+00
	%26 = fadd float %22, %25
	br label %.lr.ph.i11

	.lr.ph.i11: ; preds = %.lr.ph.i11, %_Z9sumStructPP8MyStructi.exit
	%sum.02.i8 = phi float [ 0.000000e+00, %_Z9sumStructPP8MyStructi.exit ], [ %29, %.lr.ph.i11 ]
	%i.01.i9 = phi i32 [ 0, %_Z9sumStructPP8MyStructi.exit ], [ %30, %.lr.ph.i11 ]
	%27 = fadd float %sum.02.i8, %26
	%28 = fadd float %27, %26
	%29 = fadd float %28, %26
	%30 = add nsw i32 %i.01.i9, 3
	%exitcond.i10.2 = icmp eq i32 %30, 123
	br i1 %exitcond.i10.2, label %_Z9sumStructPP8MyStructi.exit12, label %.lr.ph.i11

	_Z9sumStructPP8MyStructi.exit12: ; preds = %.lr.ph.i11
	%.lcssa25 = phi float [ %29, %.lr.ph.i11 ]
	%31 = getelementptr inbounds float, float* %data, i64 3
	store float %.lcssa25, float* %31, align 4, !tbaa !20
	%32 = load float, float* %2, align 4
	%33 = load i32, i32* %.pre-phi18, align 4
	%34 = sitofp i32 %33 to float
	%35 = fmul float %34, 3.500000e+00
	%36 = fadd float %32, %35
	br label %.lr.ph.i5

	.lr.ph.i5: ; preds = %.lr.ph.i5, %_Z9sumStructPP8MyStructi.exit12
	%sum.02.i2 = phi float [ 0.000000e+00, %_Z9sumStructPP8MyStructi.exit12 ], [ %51, %.lr.ph.i5 ]
	%i.01.i3 = phi i32 [ 0, %_Z9sumStructPP8MyStructi.exit12 ], [ %52, %.lr.ph.i5 ]
	%37 = fadd float %sum.02.i2, %36
	%38 = fadd float %37, %36
	%39 = fadd float %38, %36
	%40 = fadd float %39, %36
	%41 = fadd float %40, %36
	%42 = fadd float %41, %36
	%43 = fadd float %42, %36
	%44 = fadd float %43, %36
	%45 = fadd float %44, %36
	%46 = fadd float %45, %36
	%47 = fadd float %46, %36
	%48 = fadd float %47, %36
	%49 = fadd float %48, %36
	%50 = fadd float %49, %36
	%51 = fadd float %50, %36
	%52 = add nsw i32 %i.01.i3, 15
	%exitcond.i4.14 = icmp eq i32 %52, 12300
	br i1 %exitcond.i4.14, label %_Z9sumStructPP8MyStructi.exit6, label %.lr.ph.i5

	_Z9sumStructPP8MyStructi.exit6: ; preds = %.lr.ph.i5
	%.lcssa = phi float [ %51, %.lr.ph.i5 ]
	%53 = getelementptr inbounds float, float* %data, i64 4
	store float %.lcssa, float* %53, align 4, !tbaa !20
	ret void
	}

	; Function Attrs: norecurse nounwind readonly
	define float @_Z12getHalfValueP4Halfi(%class.Half* nocapture readonly %half_, i32 %a) #1 {
	%1 = sext i32 %a to i64
	%2 = getelementptr inbounds %class.Half, %class.Half* %half_, i64 %1, i32 0, i32 0, i32 0
	%3 = load i16, i16* %2, align 2, !tbaa !21
	%4 = sext i16 %3 to i32
	%5 = add nsw i32 %4, 123
	%6 = sitofp i32 %5 to float
	ret float %6
	}

	; Function Attrs: norecurse nounwind
	define void @_Z11myte6kernelP16TensorEvaluator6PfP9GpuDeviceiii(%class.TensorEvaluator6* nocapture readonly %structs, float* nocapture %data, %class.GpuDevice* nocapture readnone %gpudevices, i32 %a, i32 %b, i32 %c) #2 {
	%1 = sext i32 %a to i64
	%2 = getelementptr inbounds %class.TensorEvaluator6, %class.TensorEvaluator6* %structs, i64 %1, i32 0, i32 0
	%3 = load %class.Half*, %class.Half** %2, align 8, !tbaa !24
	%4 = getelementptr inbounds %class.Half, %class.Half* %3, i64 %1, i32 0, i32 0, i32 0
	%5 = load i16, i16* %4, align 2, !tbaa !21
	%6 = sext i16 %5 to i32
	%7 = add nsw i32 %6, 123
	%8 = sitofp i32 %7 to float
	store float %8, float* %data, align 4, !tbaa !20
	ret void
	}

	attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_30" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
	attributes #1 = { norecurse nounwind readonly "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_30" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
	attributes #2 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_30" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
	attributes #3 = { nounwind readnone }

	!nvvm.annotations = !{!0, !1, !2, !3, !2, !4, !4, !4, !4, !5, !5, !4}
	!llvm.module.flags = !{!6}
	!llvm.ident = !{!7}
	!nvvm.internalize.after.link = !{}
	!nvvmir.version = !{!8}

	!0 = !{void (float*, %struct.MyStruct*, i32)* @_Z8mykernelPfP8MyStructi, !"kernel", i32 1}
	!1 = !{void (%class.TensorEvaluator6*, float*, %class.GpuDevice*, i32, i32, i32)* @_Z11myte6kernelP16TensorEvaluator6PfP9GpuDeviceiii, !"kernel", i32 1}
	!2 = !{null, !"align", i32 8}
	!3 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
	!4 = !{null, !"align", i32 16}
	!5 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
	!6 = !{i32 1, !"PIC Level", i32 2}
	!7 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}
	!8 = !{i32 1, i32 2}
	!9 = !{!10, !10, i64 0}
	!10 = !{!"any pointer", !11, i64 0}
	!11 = !{!"omnipotent char", !12, i64 0}
	!12 = !{!"Simple C/C++ TBAA"}
	!13 = !{!14, !15, i64 0}
	!14 = !{!"_ZTS8MyStruct", !15, i64 0, !16, i64 4}
	!15 = !{!"float", !11, i64 0}
	!16 = !{!"int", !11, i64 0}
	!17 = !{!14, !16, i64 4}
	!18 = distinct !{!18, !19}
	!19 = !{!"llvm.loop.unroll.disable"}
	!20 = !{!15, !15, i64 0}
	!21 = !{!22, !23, i64 0}
	!22 = !{!"_ZTS8HalfImpl", !23, i64 0}
	!23 = !{!"short", !11, i64 0}
	!24 = !{!25, !10, i64 0}
	!25 = !{!"_ZTS16TensorEvaluator6", !26, i64 0, !28, i64 24}
	!26 = !{!"_ZTS16TensorEvaluator0", !10, i64 0, !27, i64 8}
	!27 = !{!"_ZTS9GpuDevice", !16, i64 0, !10, i64 8}
	!28 = !{!"_ZTS16TensorEvaluator7", !10, i64 0, !29, i64 8}
	!29 = !{!"_ZTS16TensorEvaluator2", !10, i64 0, !27, i64 8}