Skip to content

Instantly share code, notes, and snippets.

@kaushikcfd
Created December 6, 2017 20:05
Show Gist options
  • Save kaushikcfd/f2412138d014901575e68471623ee97e to your computer and use it in GitHub Desktop.
Save kaushikcfd/f2412138d014901575e68471623ee97e to your computer and use it in GitHub Desktop.
/* Shorthand for the work-item's local id along dimension N, narrowed to int. */
#define lid(N) ((int) get_local_id(N))
/* Shorthand for the work-group id along dimension N, narrowed to int. */
#define gid(N) ((int) get_group_id(N))
/* 64-bit integer atomics extension; the second kernel in this file calls
 * atom_cmpxchg on `long` values. */
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
/* When compiling as OpenCL C older than 1.2, explicitly enable the
 * double-precision extension. */
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
/* Per-basis-function weights, indexed by ibf_gather_0 (0..2) in the second
 * kernel.
 * NOTE(review): these look like the reference-space d/dxi derivatives of the
 * three linear triangle basis functions -- confirm against the generator. */
__constant double const cnst[3] = { -1.0, 1.0, 0.0 };
/* Second weight table, indexed the same way as cnst.
 * NOTE(review): presumably the matching d/deta derivatives -- confirm. */
__constant double const cnst_0[3] = { -1.0, 0.0, 1.0 };
/*
 * Zero-fill pass over A0_global.
 *
 * Launch layout: work-groups of exactly 32 x 1 x 1 work-items (enforced by
 * reqd_work_group_size). Each work-item owns the single slot
 * 32 * group_id + local_id and writes 0.0 to it when that slot index is at
 * most 66048; work-items past that bound do nothing.
 *
 * Only A0_global is touched; every other argument (including A0_size) is
 * accepted but not read by this kernel.
 *
 * NOTE(review): the upper bound 66048 is baked into the code rather than
 * derived from A0_size -- confirm the buffer really holds 66049 doubles.
 */
__kernel void __attribute__ ((reqd_work_group_size(32, 1, 1))) loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel(int const w_0_global_len, __global double const *__restrict__ w_0_global, __global int const *__restrict__ ltg_1, int const nelements, int const coords_global_len, __global double const *__restrict__ coords_global, __global int const *__restrict__ ltg_0, __global volatile double *__restrict__ A0_global, int const A0_size)
{
    /* Flat slot index of this work-item. */
    int const slot = 32 * gid(0) + lid(0);

    if (slot <= 66048)
    {
        A0_global[slot] = 0.0;
    }
}
/*
 * Atomic scatter-add pass into A0_global.
 *
 * Launch layout: work-groups of exactly 32 x 1 x 1 work-items (enforced by
 * reqd_work_group_size). Each work-item handles the index
 * 32 * group_id + local_id (called `elem` below) and:
 *   1. reads three node ids from ltg_0[3*elem + {0,1,2}] and, for each node n,
 *      the pair coords_global[2*n], coords_global[2*n + 1];
 *   2. reads three values of w_0_global through ltg_1[3*elem + {0,1,2}];
 *   3. forms two element-wide quantities (flux_a, flux_b) from those values;
 *   4. for ibf_gather_0 = 0..2 adds
 *        cnst[ibf]   * flux_a * 0.5 * |det|
 *      + cnst_0[ibf] * flux_b * 0.5 * |det|
 *      to A0_global[ltg_1[3*elem + ibf]] with a 64-bit compare-and-swap
 *      retry loop, so several work-items may update the same entry.
 *
 * Arguments w_0_global_len, nelements, coords_global_len and A0_size are
 * accepted but not read.
 *
 * Requires cl_khr_int64_base_atomics (atom_cmpxchg on long) and fp64.
 *
 * Compared with the generated original, the element-wide arithmetic is
 * computed once instead of being re-evaluated for every basis function and
 * every CAS retry. Operator order and association of each sub-expression are
 * kept exactly as generated so the floating-point evaluation is unchanged.
 *
 * NOTE(review): there is no guard against nelements; this presumably relies
 * on the launch covering exactly the valid elements -- confirm on the host.
 * NOTE(review): the x/y naming assumes coords_global stores interleaved 2-D
 * node coordinates -- confirm against the caller.
 */
__kernel void __attribute__ ((reqd_work_group_size(32, 1, 1))) loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0(int const w_0_global_len, __global double const *__restrict__ w_0_global, __global int const *__restrict__ ltg_1, int const nelements, int const coords_global_len, __global double const *__restrict__ coords_global, __global int const *__restrict__ ltg_0, __global volatile double *__restrict__ A0_global, int const A0_size)
{
    int const elem = 32 * gid(0) + lid(0);

    /* Node ids of this element in the coordinate numbering. */
    int const n0 = ltg_0[3 * elem];
    int const n1 = ltg_0[3 * elem + 1];
    int const n2 = ltg_0[3 * elem + 2];

    /* Two coordinate values per node. */
    double const x0 = coords_global[2 * n0];
    double const y0 = coords_global[2 * n0 + 1];
    double const x1 = coords_global[2 * n1];
    double const y1 = coords_global[2 * n1 + 1];
    double const x2 = coords_global[2 * n2];
    double const y2 = coords_global[2 * n2 + 1];

    /* Field values gathered through the second index map. */
    double const w0 = w_0_global[ltg_1[3 * elem]];
    double const w1 = w_0_global[ltg_1[3 * elem + 1]];
    double const w2 = w_0_global[ltg_1[3 * elem + 2]];

    /* Differences relative to node 0. */
    double const dx1 = -1.0 * x0 + x1;
    double const dx2 = -1.0 * x0 + x2;
    double const dy1 = -1.0 * y0 + y1;
    double const dy2 = -1.0 * y0 + y2;
    double const dw1 = -1.0 * w0 + w1;
    double const dw2 = -1.0 * w0 + w2;

    /* 2x2 determinant built from the coordinate differences. */
    double const det = dx1 * dy2 + -1.0 * dx2 * dy1;
    double const abs_det = fabs(det);

    /* Inner bracketed terms of the generated expression. */
    double const gx = dw1 * dy2 * 1.0 / det + dw2 * 1.0 / det * -1.0 * dy1;
    double const gy = dw1 * 1.0 / det * -1.0 * dx2 + dw2 * dx1 * 1.0 / det;

    /* Factors multiplying cnst[...] and cnst_0[...] respectively. */
    double const flux_a = gx * dy2 * 1.0 / det + gy * 1.0 / det * -1.0 * dx2;
    double const flux_b = gx * 1.0 / det * -1.0 * dy1 + gy * dx1 * 1.0 / det;

    for (int ibf_gather_0 = 0; ibf_gather_0 <= 2; ++ibf_gather_0)
    {
        /* Destination entry and the two addends for this basis function. */
        int const row = ltg_1[3 * elem + ibf_gather_0];
        double const add_a = cnst[ibf_gather_0] * flux_a * 0.5 * abs_det;
        double const add_b = cnst_0[ibf_gather_0] * flux_b * 0.5 * abs_det;

        double loopy_old_val;
        double loopy_new_val;
        do
        {
            /* Re-read the current value on every attempt; only this load
             * can change between retries. */
            loopy_old_val = A0_global[row];
            loopy_new_val = loopy_old_val + add_a + add_b;
        }
        /* Compare raw 64-bit patterns (as_long) rather than doubles, so the
         * loop terminates even if the stored value is NaN. */
        while (atom_cmpxchg((__global long *) &(A0_global[row]), as_long(loopy_old_val), as_long(loopy_new_val)) != as_long(loopy_old_val));
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment