Kaushik Kulkarni kaushikcfd

This is the ELF corresponding to the mass matrix. The corresponding program can be found here. By comparing the program with the kernel, we can easily identify the three cmpexchg (compare-exchange) operations at the end of the file. From this we see that the iel loop starts at .L_10, which means that all the floating-point operations are in the block labelled .L_10.

//--------------------- .text.loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0 --------------------------
	.section	.text.loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0,"ax",@progbits
	.sectioninfo	@"SHI_REGISTERS=32"
	.align	32
.text.loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0:
 .type loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0,@function

Timing

Kernel	Loopy	PyOP2	MatFree
Mass	0.003	0.0148	0.15501
Laplace	0.008	0.0158	0.68635
Hyperelasticity	0.052	0.0344	0.25163

Loopy Kernel Statistics

Kernel | GFLOPs | Bandwidth | Total Time |

	/* Shorthands for the local id and group id along dimension N, cast to int. */
	#define lid(N) ((int) get_local_id(N))
	#define gid(N) ((int) get_group_id(N))
	/* Enable the cl_khr_int64_base_atomics extension. */
	#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
	/* The cl_khr_fp64 pragma is only issued when __OPENCL_C_VERSION__ is below 120. */
	#if __OPENCL_C_VERSION__ < 120
	#pragma OPENCL EXTENSION cl_khr_fp64: enable
	#endif

	/* Two 3-entry double tables in the __constant address space. */
	/* NOTE(review): this excerpt does not show where they are read - confirm against the kernel body. */
	__constant double const cnst[3] = { -1.0, 1.0, 0.0 };
	__constant double const cnst_0[3] = { -1.0, 0.0, 1.0 };

	/* Shorthands for the local id and group id along dimension N, cast to int. */
	#define lid(N) ((int) get_local_id(N))
	#define gid(N) ((int) get_group_id(N))
	/* Enable the cl_khr_int64_base_atomics extension. */
	#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
	/* The cl_khr_fp64 pragma is only issued when __OPENCL_C_VERSION__ is below 120. */
	#if __OPENCL_C_VERSION__ < 120
	#pragma OPENCL EXTENSION cl_khr_fp64: enable
	#endif

	/* cnst: 9 doubles (declared as 3 * 3); cnst_0: 3 doubles; both in the __constant address space. */
	/* NOTE(review): the values agree, up to rounding, with the t0 and t4 tables of */
	/* form_cell_integral_otherwise shown further down - presumably the same data; confirm. */
	__constant double const cnst[3 * 3] = { 0.6666666666666669, 0.16666666666666663, 0.16666666666666666, 0.16666666666666674, 0.16666666666666663, 0.6666666666666665, 0.16666666666666669, 0.6666666666666666, 0.16666666666666663 };
	__constant double const cnst_0[3] = { 0.16666666666666666, 0.16666666666666666, 0.16666666666666666 };

	/* Shorthands for the local id and group id along dimension N, cast to int. */
	#define lid(N) ((int) get_local_id(N))
	#define gid(N) ((int) get_group_id(N))
	/* Enable the cl_khr_int64_base_atomics extension. */
	#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
	/* The cl_khr_fp64 pragma is only issued when __OPENCL_C_VERSION__ is below 120. */
	#if __OPENCL_C_VERSION__ < 120
	#pragma OPENCL EXTENSION cl_khr_fp64: enable
	#endif

	/* Two 3-entry double tables in the __constant address space. */
	/* NOTE(review): this excerpt does not show where they are read - confirm against the kernel body. */
	__constant double const cnst[3] = { -1.0, 1.0, 0.0 };
	__constant double const cnst_0[3] = { -1.0, 0.0, 1.0 };

	/* Shorthands for the local id and group id along dimension N, cast to int. */
	#define lid(N) ((int) get_local_id(N))
	#define gid(N) ((int) get_group_id(N))
	/* Enable the cl_khr_int64_base_atomics extension. */
	#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
	/* The cl_khr_fp64 pragma is only issued when __OPENCL_C_VERSION__ is below 120. */
	#if __OPENCL_C_VERSION__ < 120
	#pragma OPENCL EXTENSION cl_khr_fp64: enable
	#endif

	/* cnst: 9 doubles (declared as 3 * 3); cnst_0: 3 doubles; both in the __constant address space. */
	/* NOTE(review): the values agree, up to rounding, with the t0 and t4 tables of */
	/* form_cell_integral_otherwise shown further down - presumably the same data; confirm. */
	__constant double const cnst[3 * 3] = { 0.6666666666666669, 0.16666666666666663, 0.16666666666666666, 0.16666666666666674, 0.16666666666666663, 0.6666666666666665, 0.16666666666666669, 0.6666666666666666, 0.16666666666666663 };
	__constant double const cnst_0[3] = { 0.16666666666666666, 0.16666666666666666, 0.16666666666666666 };

	static inline void form_cell_integral_otherwise (double A[1] , const double const restrict restrict coords , const double const restrict restrict w_0 , const double const restrict restrict w_1 )
	{
	static const double t0[3][3] = {{0.666666666666667, 0.166666666666667, 0.166666666666667},
	{0.166666666666667, 0.166666666666667, 0.666666666666667},
	{0.166666666666667, 0.666666666666667, 0.166666666666667}};
	double t1 = (-1 * coords[0][0]);
	double t2 = (-1 * coords[0][1]);
	double t3 = fabs(((t1 + coords[1][0]) * (t2 + coords[2][1])) + (-1 * ((t1 + coords[2][0]) * (t2 + coords[1][1]))));
	static const double t4[3] = {0.166666666666667, 0.166666666666667, 0.166666666666667};

	// PyOP2 kernel
	static inline void form00_cell_integral_otherwise (double A[3][3] , const double const restrict restrict coords )
	{
	double t19[4] ;
	double t20[4] ;
	double t0 = (-1 * coords[0][0]);
	double t1 = (t0 + coords[1][0]);
	double t2 = (-1 * coords[0][1]);
	double t3 = (t2 + coords[2][1]);
	double t4 = (t0 + coords[2][0]);

	===========================================================================
	failing kernel during pre-schedule check:
	===========================================================================
	---------------------------------------------------------------------------
	KERNEL: loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel
	---------------------------------------------------------------------------
	ARGUMENTS:
	A0_global: GlobalArg, type: np_atomic:dtype('float64'), shape: (A0_size), dim_tags: (N0:stride:1)
	A0_size: ValueArg, type: np:dtype('int32')
	A1_global: GlobalArg, type: np_atomic:dtype('float64'), shape: (A1_size, 2), dim_tags: (N1:stride:2, N0:stride:1)

	---------------------------------------------------------------------------
	KERNEL: tsfc_kernel
	---------------------------------------------------------------------------
	ARGUMENTS:
	A_0: GlobalArg, type: <runtime>, shape: (3), dim_tags: (N0:stride:1)
	A_1: GlobalArg, type: <runtime>, shape: (3), dim_tags: (N0:stride:1)
	A_2: GlobalArg, type: <runtime>, shape: (3), dim_tags: (N0:stride:1)
	coords: GlobalArg, type: <runtime>, shape: (3, 2), dim_tags: (N1:stride:2, N0:stride:1)
	w_0: GlobalArg, type: <runtime>, shape: (3, 1), dim_tags: (N1:stride:1, N0:stride:1)
	w_1: GlobalArg, type: <runtime>, shape: (3, 2), dim_tags: (N1:stride:2, N0:stride:1)