inferrna · December 1, 2016 08:04
diff --git a/test_tf-out.py b/test_tf-out.py
 $ OFFSET32_BIT=1 CLANG_HOME=/usr/lib/llvm-3.8 py.test -svx test/tf/test_tf.py======================================================================== test session starts =========================================================================
 platform linux -- Python 3.5.2, pytest-2.9.1, py-1.4.31, pluggy-0.3.1 -- /usr/bin/python3
 cachedir: .cache
 rootdir: /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl, inifile: pytest.ini
 collected 2 items 

 test/tf/test_tf.py::test_cwise_sqrt context <pyopencl.Context at 0x222ece0 on <pyopencl.Device 'Pitcairn' on 'AMD Accelerated Parallel Processing' at 0x22b9f30>>
 options []
 opt_options []
 iropencl_options []
 running [/usr/lib/llvm-3.8/bin/opt -S test/tf/samples/cwise_op_gpu_sqrt-device-noopt.ll -o /tmp/test-opt.ll]

 running [build/ir-to-opencl --inputfile /tmp/test-opt.ll --outputfile /tmp/test-device.cl --kernelname _ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_]
 F name _ZN5Eigen8internal15EigenMetaKe
 running generation on _ZN5Eigen8internal15EigenMetaKe
 running generation on _ZN5Eigen15TensorEvaluatorIK0
 running generation on _ZN5Eigen15TensorEvaluatorIK1
 running generation on _ZN5Eigen15TensorEvaluatorIK1
 running generation on _ZN5Eigen15TensorEvaluatorIN2
 running generation on _ZN5Eigen15TensorEvaluatorIN9
 running generation on _ZNK5Eigen15TensorEvaluatorI3
 running generation on _ZNK5Eigen15TensorEvaluatorI8
 running generation on _ZN5Eigen15TensorEvaluatorIK1
 running generation on _ZN5Eigen8internal6pstoreIf64_gp
 running generation on _ZNK5Eigen15TensorEvaluatorI11
 running generation on _ZNK5Eigen15TensorEvaluatorI6
 running generation on _ZNK5Eigen8internal14scalar_10
 running generation on _ZNK5Eigen8internal14scalar_5
 running generation on _ZN5Eigen8internal5psqrtI6fl7

 creating program...
 building kernel...
 /usr/local/lib/python3.5/dist-packages/pyopencl/__init__.py:206: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
  "to see more.", CompilerWarning)
 running kernel...
 FAILED
 ====================================================================== short test summary info =======================================================================
 FAIL test/tf/test_tf.py::test_cwise_sqrt

 ============================================================================== FAILURES ==============================================================================
 __________________________________________________________________________ test_cwise_sqrt ___________________________________________________________________________

 context = <pyopencl.Context at 0x222ece0 on <pyopencl.Device 'Pitcairn' on 'AMD Accelerated Parallel Processing' at 0x22b9f30>>
 q = <pyopencl.cffi_cl.CommandQueue object at 0x7f22d07aa668>
 float_data = array([ 0.28847906, -0.46295407, -1.33800447, ...,  0.85320115,
       -0.70543814,  0.55171764], dtype=float32)
 float_data_gpu = <pyopencl.cffi_cl.Buffer object at 0x7f22d07aa630>

    @pytest.mark.skipif(os.environ.get('TRAVIS', None) == 'true', reason='fails on travis mac cpu, not looked into why yet')
    def test_cwise_sqrt(context, q, float_data, float_data_gpu):
        options = test_common.cocl_options()
        i = 0
        opt_options = []
        iropencl_options = []
        while i < len(options):
            if options[i] == '--devicell-opt':
                opt_options.append('-' + options[i + 1])
                i += 2
                continue
            if options[i] in ['--run_branching_transforms', '--branches_as_switch']:
                iropencl_options.append(options[i])
                i += 1
                continue
            raise Exception('unknown option ', options[i])
            i += 1
        print('opt_options', opt_options)
        print('iropencl_options', iropencl_options)
        myrun([
            join(CLANG_HOME, 'bin/opt')
        ] + opt_options + [
            '-S',
            'test/tf/samples/cwise_op_gpu_sqrt-device-noopt.ll',
            '-o', '/tmp/test-opt.ll'
        ])
    
        myrun([
            'build/ir-to-opencl'
        ] + iropencl_options + [
            '--inputfile', '/tmp/test-opt.ll',
            '--outputfile', '/tmp/test-device.cl',
            '--kernelname', '_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_'
        ])
    
        with open('/tmp/test-device.cl', 'r') as f:
            cl_sourcecode = f.read()
    
        print('creating program...')
        prog_unbuilt = cl.Program(context, cl_sourcecode)
        print('building kernel...')
        prog = prog_unbuilt.build()
    
        N = 10
    
        # global struct Eigen__TensorEvaluator_nopointers* eval_nopointers, global float* eval_ptr0, long eval_ptr0_offset, global float* eval_ptr1, long eval_ptr1_offset, int size, local int *scratch
    
        # what we need:
        # struct Eigen__TensorEvaluator_nopointers   Note that none of the values we copy across are actually use, so we can just create a sufficiently large buffer...
        # global float *eval_ptr0  => this will receive the result.  just create a sufficiently large buffer
        # ptr0_offset => 0
        # eval_ptr1 => will contian the data we want to reduce
        # eval_ptr1_offset=> 0
        # size =>  eg 10, to reduce 10 values
        # scratch => set to workgroupsize * sizeof(float)
    
        eval_nopointers_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)
    
        eval_ptr0 = np.zeros(1024, dtype=np.float32)
        eval_ptr0_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=eval_ptr0)
        eval_ptr0_offset = 0
    
        eval_ptr1 = np.random.uniform(0, 1, size=(1024,)).astype(np.float32) + 1.0
        eval_ptr1_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=eval_ptr1)
        eval_ptr1_offset = 0
    
        size = N
    
        global_size = 256
        workgroup_size = 256
        scratch = workgroup_size * 4
    
        print('running kernel...')
        prog.__getattr__('_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_'[:31])(
            q, (global_size,), (workgroup_size,),
            eval_nopointers_gpu,
            eval_ptr0_gpu, offset_type(eval_ptr0_offset),
            eval_ptr1_gpu, offset_type(eval_ptr1_offset),
            np.int32(size),
 >           cl.LocalMemory(scratch)
        )

 test/tf/test_tf.py:108: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
 /usr/local/lib/python3.5/dist-packages/pyopencl/__init__.py:995: in kernel_call
    return self._enqueue(self, queue, global_size, local_size, *args, **kwargs)
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 self = <pyopencl.cffi_cl.Kernel object at 0x7f22d079aef0>, queue = <pyopencl.cffi_cl.CommandQueue object at 0x7f22d07aa668>, global_size = (256,), local_size = (256,)
 arg0 = <pyopencl.cffi_cl.Buffer object at 0x7f22de3f1470>, arg1 = <pyopencl.cffi_cl.Buffer object at 0x7f22de3f1198>, arg2 = 0
 arg3 = <pyopencl.cffi_cl.Buffer object at 0x7f22d079ae80>, arg4 = 0, arg5 = 10, arg6 = <pyopencl.cffi_cl.LocalMemory object at 0x7f22d07d2f88>, global_offset = None
 g_times_l = None, wait_for = None

 >   ???
 E   pyopencl.cffi_cl.LogicError: when processing argument #3 (1-based): clsetkernelarg failed: INVALID_ARG_SIZE

 <generated function enqueue_knl__ZN5Eigen8internal15EigenMetaKe>:138: LogicError
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Interrupted: stopping after 1 failures !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ====================================================================== 1 failed in 0.81 seconds ======================================================================
	$ OFFSET32_BIT=1 CLANG_HOME=/usr/lib/llvm-3.8 py.test -svx test/tf/test_tf.py======================================================================== test session starts =========================================================================
	platform linux -- Python 3.5.2, pytest-2.9.1, py-1.4.31, pluggy-0.3.1 -- /usr/bin/python3
	cachedir: .cache
	rootdir: /media/Compressed/Drivers_bios/src/dev/tensorflow-cl/third_party/cuda-on-cl, inifile: pytest.ini
	collected 2 items

	test/tf/test_tf.py::test_cwise_sqrt context <pyopencl.Context at 0x222ece0 on <pyopencl.Device 'Pitcairn' on 'AMD Accelerated Parallel Processing' at 0x22b9f30>>
	options []
	opt_options []
	iropencl_options []
	running [/usr/lib/llvm-3.8/bin/opt -S test/tf/samples/cwise_op_gpu_sqrt-device-noopt.ll -o /tmp/test-opt.ll]

	running [build/ir-to-opencl --inputfile /tmp/test-opt.ll --outputfile /tmp/test-device.cl --kernelname _ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_]
	F name _ZN5Eigen8internal15EigenMetaKe
	running generation on _ZN5Eigen8internal15EigenMetaKe
	running generation on _ZN5Eigen15TensorEvaluatorIK0
	running generation on _ZN5Eigen15TensorEvaluatorIK1
	running generation on _ZN5Eigen15TensorEvaluatorIK1
	running generation on _ZN5Eigen15TensorEvaluatorIN2
	running generation on _ZN5Eigen15TensorEvaluatorIN9
	running generation on _ZNK5Eigen15TensorEvaluatorI3
	running generation on _ZNK5Eigen15TensorEvaluatorI8
	running generation on _ZN5Eigen15TensorEvaluatorIK1
	running generation on _ZN5Eigen8internal6pstoreIf64_gp
	running generation on _ZNK5Eigen15TensorEvaluatorI11
	running generation on _ZNK5Eigen15TensorEvaluatorI6
	running generation on _ZNK5Eigen8internal14scalar_10
	running generation on _ZNK5Eigen8internal14scalar_5
	running generation on _ZN5Eigen8internal5psqrtI6fl7

	creating program...
	building kernel...
	/usr/local/lib/python3.5/dist-packages/pyopencl/__init__.py:206: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
	"to see more.", CompilerWarning)
	running kernel...
	FAILED
	====================================================================== short test summary info =======================================================================
	FAIL test/tf/test_tf.py::test_cwise_sqrt

	============================================================================== FAILURES ==============================================================================
	__________________________________________________________________________ test_cwise_sqrt ___________________________________________________________________________

	context = <pyopencl.Context at 0x222ece0 on <pyopencl.Device 'Pitcairn' on 'AMD Accelerated Parallel Processing' at 0x22b9f30>>
	q = <pyopencl.cffi_cl.CommandQueue object at 0x7f22d07aa668>
	float_data = array([ 0.28847906, -0.46295407, -1.33800447, ..., 0.85320115,
	-0.70543814, 0.55171764], dtype=float32)
	float_data_gpu = <pyopencl.cffi_cl.Buffer object at 0x7f22d07aa630>

	@pytest.mark.skipif(os.environ.get('TRAVIS', None) == 'true', reason='fails on travis mac cpu, not looked into why yet')
	def test_cwise_sqrt(context, q, float_data, float_data_gpu):
	options = test_common.cocl_options()
	i = 0
	opt_options = []
	iropencl_options = []
	while i < len(options):
	if options[i] == '--devicell-opt':
	opt_options.append('-' + options[i + 1])
	i += 2
	continue
	if options[i] in ['--run_branching_transforms', '--branches_as_switch']:
	iropencl_options.append(options[i])
	i += 1
	continue
	raise Exception('unknown option ', options[i])
	i += 1
	print('opt_options', opt_options)
	print('iropencl_options', iropencl_options)
	myrun([
	join(CLANG_HOME, 'bin/opt')
	] + opt_options + [
	'-S',
	'test/tf/samples/cwise_op_gpu_sqrt-device-noopt.ll',
	'-o', '/tmp/test-opt.ll'
	])

	myrun([
	'build/ir-to-opencl'
	] + iropencl_options + [
	'--inputfile', '/tmp/test-opt.ll',
	'--outputfile', '/tmp/test-device.cl',
	'--kernelname', '_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_'
	])

	with open('/tmp/test-device.cl', 'r') as f:
	cl_sourcecode = f.read()

	print('creating program...')
	prog_unbuilt = cl.Program(context, cl_sourcecode)
	print('building kernel...')
	prog = prog_unbuilt.build()

	N = 10

	# global struct Eigen__TensorEvaluator_nopointers* eval_nopointers, global float* eval_ptr0, long eval_ptr0_offset, global float* eval_ptr1, long eval_ptr1_offset, int size, local int *scratch

	# what we need:
	# struct Eigen__TensorEvaluator_nopointers Note that none of the values we copy across are actually use, so we can just create a sufficiently large buffer...
	# global float *eval_ptr0 => this will receive the result. just create a sufficiently large buffer
	# ptr0_offset => 0
	# eval_ptr1 => will contian the data we want to reduce
	# eval_ptr1_offset=> 0
	# size => eg 10, to reduce 10 values
	# scratch => set to workgroupsize * sizeof(float)

	eval_nopointers_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)

	eval_ptr0 = np.zeros(1024, dtype=np.float32)
	eval_ptr0_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=eval_ptr0)
	eval_ptr0_offset = 0

	eval_ptr1 = np.random.uniform(0, 1, size=(1024,)).astype(np.float32) + 1.0
	eval_ptr1_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=eval_ptr1)
	eval_ptr1_offset = 0

	size = N

	global_size = 256
	workgroup_size = 256
	scratch = workgroup_size * 4

	print('running kernel...')
	prog.__getattr__('_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_'[:31])(
	q, (global_size,), (workgroup_size,),
	eval_nopointers_gpu,
	eval_ptr0_gpu, offset_type(eval_ptr0_offset),
	eval_ptr1_gpu, offset_type(eval_ptr1_offset),
	np.int32(size),
	> cl.LocalMemory(scratch)
	)

	test/tf/test_tf.py:108:
	_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
	/usr/local/lib/python3.5/dist-packages/pyopencl/__init__.py:995: in kernel_call
	return self._enqueue(self, queue, global_size, local_size, *args, **kwargs)
	_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

	self = <pyopencl.cffi_cl.Kernel object at 0x7f22d079aef0>, queue = <pyopencl.cffi_cl.CommandQueue object at 0x7f22d07aa668>, global_size = (256,), local_size = (256,)
	arg0 = <pyopencl.cffi_cl.Buffer object at 0x7f22de3f1470>, arg1 = <pyopencl.cffi_cl.Buffer object at 0x7f22de3f1198>, arg2 = 0
	arg3 = <pyopencl.cffi_cl.Buffer object at 0x7f22d079ae80>, arg4 = 0, arg5 = 10, arg6 = <pyopencl.cffi_cl.LocalMemory object at 0x7f22d07d2f88>, global_offset = None
	g_times_l = None, wait_for = None

	> ???
	E pyopencl.cffi_cl.LogicError: when processing argument #3 (1-based): clsetkernelarg failed: INVALID_ARG_SIZE

	<generated function enqueue_knl__ZN5Eigen8internal15EigenMetaKe>:138: LogicError
	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Interrupted: stopping after 1 failures !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
	====================================================================== 1 failed in 0.81 seconds ======================================================================