Created
December 8, 2017 21:28
-
-
Save kaushikcfd/53cf6824bf9fdd261558fd5416230ee2 to your computer and use it in GitHub Desktop.
This is the serial version of the mass matrix.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* Shorthand for the work-item index within its group and the group index, both cast to int. */ | |
| #define lid(AXIS) ((int) get_local_id(AXIS)) | |
| #define gid(AXIS) ((int) get_group_id(AXIS)) | |
| /* 64-bit atomics are required by the atom_cmpxchg call used for the double-precision accumulation. */ | |
| #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable | |
| /* Before OpenCL C 1.2 the double type has to be enabled explicitly. */ | |
| #if __OPENCL_C_VERSION__ < 120 | |
| #pragma OPENCL EXTENSION cl_khr_fp64: enable | |
| #endif | |
| /* | |
|  * Flattened 3x3 table, read by the kernels as cnst[3 * i12 + j]. | |
|  * NOTE(review): presumably basis-function values at quadrature points (row = point i12) - confirm against the generator. | |
|  */ | |
| __constant double const cnst[3 * 3] = { | |
|     0.6666666666666669, 0.16666666666666663, 0.16666666666666666, | |
|     0.16666666666666674, 0.16666666666666663, 0.6666666666666665, | |
|     0.16666666666666669, 0.6666666666666666, 0.16666666666666663 | |
| }; | |
| /* | |
|  * One factor per index i12, multiplied into every term of the i12 sum. | |
|  * NOTE(review): presumably quadrature weights - confirm against the generator. | |
|  */ | |
| __constant double const cnst_0[3] = { | |
|     0.16666666666666666, | |
|     0.16666666666666666, | |
|     0.16666666666666666 | |
| }; | |
| /* | |
|  * Initialisation kernel: stores 0.0 into A0_global[0] through A0_global[1088] | |
|  * (1089 entries), in ascending index order, from a single work-item. | |
|  * | |
|  * Only A0_global is touched; every other argument is accepted but not read. | |
|  * NOTE(review): the upper index 1088 is a literal rather than derived from | |
|  * A0_size - confirm the buffer really holds at least 1089 doubles. | |
|  */ | |
| __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel(int const w_0_global_len, __global double const *__restrict__ w_0_global, __global int const *__restrict__ ltg_1, int const nelements, int const coords_global_len, __global double const *__restrict__ coords_global, __global int const *__restrict__ ltg_0, __global volatile double *__restrict__ A0_global, int const A0_size) | |
| { | |
|   int slot = 0; | |
|   while (slot != 1089) | |
|   { | |
|     A0_global[slot] = 0.0; | |
|     ++slot; | |
|   } | |
| } | |
| /* | |
|  * Accumulation kernel, run by a single work-item. | |
|  * | |
|  * For each element iel in 0..2047 it: | |
|  *   1. reads three indices from ltg_0 and three from ltg_1 (stride 3 per element), | |
|  *   2. reads three coefficients w_0_global[ltg_1[...]] and three interleaved | |
|  *      coordinate pairs coords_global[2 * node], coords_global[2 * node + 1], | |
|  *   3. forms three local values sum_tmp_0[j] by summing over i12 the product of | |
|  *      cnst[3 * i12 + j], the cnst-weighted combination of the coefficients, | |
|  *      cnst_0[i12], and the absolute value of the 2x2 determinant built from | |
|  *      the coordinate differences, | |
|  *   4. adds sum_tmp_0[j] to A0_global[ltg_1[3 * iel + j]] with a compare-and-swap loop. | |
|  * | |
|  * NOTE(review): this looks like the action of a P1 triangle mass matrix on w_0 - confirm. | |
|  * NOTE(review): the element count 2048 is a literal; nelements, w_0_global_len, | |
|  * coords_global_len and A0_size are accepted but not read. | |
|  * | |
|  * Changes relative to the generated code: everything that does not depend on the | |
|  * inner loop indices is loaded or computed once per element, and the | |
|  * double-to-long reinterpretation uses as_long() instead of pointer-cast punning. | |
|  * The order of floating-point operations is unchanged. | |
|  */ | |
| __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0(int const w_0_global_len, __global double const *__restrict__ w_0_global, __global int const *__restrict__ ltg_1, int const nelements, int const coords_global_len, __global double const *__restrict__ coords_global, __global int const *__restrict__ ltg_0, __global volatile double *__restrict__ A0_global, int const A0_size) | |
| { | |
|   for (int iel = 0; iel <= 2047; ++iel) | |
|   { | |
|     /* Per-element indices used to address the coordinate array. */ | |
|     int const c0 = ltg_0[3 * iel]; | |
|     int const c1 = ltg_0[3 * iel + 1]; | |
|     int const c2 = ltg_0[3 * iel + 2]; | |
|     /* Coefficients gathered through ltg_1; they do not depend on the inner loops. */ | |
|     double const w0 = w_0_global[ltg_1[3 * iel]]; | |
|     double const w1 = w_0_global[ltg_1[3 * iel + 1]]; | |
|     double const w2 = w_0_global[ltg_1[3 * iel + 2]]; | |
|     /* Coordinates are stored as interleaved pairs: first component at 2 * node, second at 2 * node + 1. */ | |
|     double const x0 = coords_global[2 * c0]; | |
|     double const y0 = coords_global[2 * c0 + 1]; | |
|     double const x1 = coords_global[2 * c1]; | |
|     double const y1 = coords_global[2 * c1 + 1]; | |
|     double const x2 = coords_global[2 * c2]; | |
|     double const y2 = coords_global[2 * c2 + 1]; | |
|     /* Absolute value of (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0), written exactly as the generator emitted it. */ | |
|     double const abs_det = fabs((-1.0 * x0 + x1) * (-1.0 * y0 + y2) + -1.0 * (-1.0 * x0 + x2) * (-1.0 * y0 + y1)); | |
|     double sum_tmp_0[3]; | |
|     for (int i1_0 = 0; i1_0 <= 2; ++i1_0) | |
|     { | |
|       double acc_i12 = 0.0; | |
|       for (int i12 = 0; i12 <= 2; ++i12) | |
|       { | |
|         /* Same left-to-right product and same summation order as the original expression. */ | |
|         acc_i12 = acc_i12 + cnst[3 * i12 + i1_0] * (cnst[3 * i12 + 2] * w2 + cnst[3 * i12] * w0 + cnst[3 * i12 + 1] * w1) * cnst_0[i12] * abs_det; | |
|       } | |
|       sum_tmp_0[i1_0] = acc_i12; | |
|     } | |
|     for (int ibf_gather_0 = 0; ibf_gather_0 <= 2; ++ibf_gather_0) | |
|     { | |
|       __global volatile double *const target = &(A0_global[ltg_1[3 * iel + ibf_gather_0]]); | |
|       double loopy_old_val; | |
|       double loopy_new_val; | |
|       /* Emulated atomic add on a double: retry until the 64-bit pattern seen at read time is still in place when swapping. */ | |
|       do | |
|       { | |
|         loopy_old_val = *target; | |
|         loopy_new_val = loopy_old_val + sum_tmp_0[ibf_gather_0]; | |
|       } | |
|       while (atom_cmpxchg((__global long *) target, as_long(loopy_old_val), as_long(loopy_new_val)) != as_long(loopy_old_val)); | |
|     } | |
|   } | |
| } | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For this kernel:
nelements = 32*32*2 = 2048, A0_size = 1088