/*
 * OpenCL kernel (work-group size fixed to 1x1x1) that writes the three
 * entries A_0[0..2].
 *
 * For each output index i1, it sums over i12 = 0..2 the product
 *   cnst[3*i12 + i1]
 *     * (cnst[3*i12 + 2]*w_0[2] + cnst[3*i12]*w_0[0] + cnst[3*i12 + 1]*w_0[1])
 *     * cnst_0[i12]
 *     * |(x1 - x0)*(y2 - y0) - (x2 - x0)*(y1 - y0)|
 * where (x0, y0) = coords[0], coords[1]; x1 = coords[2]; y1 = coords[3];
 * x2 = coords[4]; y2 = coords[5].
 *
 * A_0    - output, entries 0..2 are overwritten.
 * coords - input, entries 0..5 are read.
 * w_0    - input, entries 0..2 are read.
 *
 * cnst and cnst_0 are not declared in this block; they must be provided
 * elsewhere in the program (presumably constant tables emitted by the code
 * generator - confirm).
 */
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) tsfc_kernel(__global double *__restrict__ A_0, __global double const *__restrict__ coords, __global double const *__restrict__ w_0)
{
    double acc_i12;
    double cse;
    double cse_0;
    double det_abs;

    cse_0 = -1.0 * coords[1];
    cse = -1.0 * coords[0];

    /*
     * This factor depends only on coords, so evaluate it once instead of on
     * every inner-loop iteration. It stays the last factor of the product
     * below, so the floating-point evaluation order is unchanged.
     */
    det_abs = fabs((cse + coords[2]) * (cse_0 + coords[5]) + -1.0 * (cse + coords[4]) * (cse_0 + coords[3]));

    for (int i1 = 0; i1 <= 2; ++i1) {
        acc_i12 = 0.0;
        for (int i12 = 0; i12 <= 2; ++i12) {
            acc_i12 = acc_i12 + cnst[3 * i12 + i1] * (cnst[3 * i12 + 2] * w_0[2] + cnst[3 * i12] * w_0[0] + cnst[3 * i12 + 1] * w_0[1]) * cnst_0[i12] * det_abs;
        }
        A_0[i1] = acc_i12;
    }
}
/*
 * Accumulate a 3x3 block into A from three tabulated points.
 *
 * For every point q and every pair (row, col) this adds
 *   tab[q][row] * tab[q][col] * wgt[q] * det_abs
 * to A[row][col], where det_abs is
 *   |(x1 - x0)*(y2 - y0) - (x2 - x0)*(y1 - y0)|
 * computed from coords[v][0] (x) and coords[v][1] (y) for v = 0, 1, 2.
 *
 * A      - in/out; contributions are added to the existing contents, so the
 *          caller is responsible for initialising it.
 * coords - three pointers, each to at least two doubles.
 *
 * NOTE(review): tab/wgt look like basis-function values and quadrature
 * weights for a three-point rule on a triangle - confirm against the
 * generator.
 */
static inline void form00_cell_integral_otherwise(double A[3][3], const double *const restrict *restrict coords)
{
    /* Rows are declared with four slots but only the first three are
     * initialised and read; the fourth is implicitly zero. */
    static const double tab[3][4] = {
        {0.666666666666667, 0.166666666666667, 0.166666666666667},
        {0.166666666666667, 0.166666666666667, 0.666666666666667},
        {0.166666666666667, 0.666666666666667, 0.166666666666667}
    };
    static const double wgt[4] = {0.166666666666667, 0.166666666666667, 0.166666666666667};

    const double neg_x0 = -1 * coords[0][0];
    const double neg_y0 = -1 * coords[0][1];

    /* The two cross terms are kept as separate products so the arithmetic
     * matches the generated expression exactly. */
    const double cross_a = (neg_x0 + coords[1][0]) * (neg_y0 + coords[2][1]);
    const double cross_b = (neg_x0 + coords[2][0]) * (neg_y0 + coords[1][1]);
    const double det_abs = fabs(cross_a + (-1 * cross_b));

    for (int q = 0; q < 3; ++q) {
        const double scaled = wgt[q] * det_abs;
        for (int row = 0; row < 3; ++row) {
            const double row_factor = tab[q][row] * scaled;
            for (int col = 0; col < 3; ++col) {
#pragma coffee expression
                A[row][col] += tab[q][col] * row_factor;
            }
        }
    }
}
- New Loopy kernel, after taking into account the extra CSEs (common subexpressions) that we were previously dropping:
/*
 * OpenCL kernel (work-group size fixed to 1x1x1) that writes the three
 * entries A_0[0..2], with the common subexpressions stored in explicit
 * temporaries.
 *
 * For each output index i1, it sums over i12 = 0..2 the product
 *   cnst[3*i12 + i1]
 *     * ((cnst_0[i12] * cse_1)
 *        * ((cnst[3*i12]*w_0[0] + cnst[3*i12 + 1]*w_0[1]) + cnst[3*i12 + 2]*w_0[2]))
 * where cse_1 = |(x1 - x0)*(y2 - y0) - (x2 - x0)*(y1 - y0)| with
 * (x0, y0) = coords[0], coords[1]; x1 = coords[2]; y1 = coords[3];
 * x2 = coords[4]; y2 = coords[5].
 *
 * A_0    - output, entries 0..2 are overwritten.
 * coords - input, entries 0..5 are read.
 * w_0    - input, entries 0..2 are read.
 *
 * cnst and cnst_0 are not declared in this block; they must be provided
 * elsewhere in the program (presumably constant tables emitted by the code
 * generator - confirm).
 */
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) tsfc_kernel(__global double *__restrict__ A_0, __global double const *__restrict__ coords, __global double const *__restrict__ w_0)
{
    double acc_i12;
    double cse;
    double cse_0;
    double cse_1;
    double cse_2[3];
    double cse_3[3];
    double cse_4[3];
    double cse_5[3];
    double cse_6[3];
    double cse_7[3];
    double cse_8[3];
    double cse_9[3 * 3];

    cse_0 = -1.0 * coords[1];
    cse = -1.0 * coords[0];
    cse_1 = fabs((cse + coords[2]) * (cse_0 + coords[5]) + -1.0 * (cse + coords[4]) * (cse_0 + coords[3]));

    for (int i1 = 0; i1 <= 2; ++i1)
    {
        acc_i12 = 0.0;
        for (int i12 = 0; i12 <= 2; ++i12)
        {
            cse_6[i12] = cnst[3 * i12 + 2] * w_0[2];
            cse_4[i12] = cnst[3 * i12 + 1] * w_0[1];
            cse_3[i12] = cnst[3 * i12] * w_0[0];
            /*
             * Every array temporary must be read through the same subscript
             * it was written with. Using the bare array name would operate
             * on the array's address rather than the element, which is not
             * valid for + or *.
             */
            cse_5[i12] = cse_3[i12] + cse_4[i12];
            cse_7[i12] = cse_5[i12] + cse_6[i12];
            cse_2[i12] = cnst_0[i12] * cse_1;
            cse_8[i12] = cse_2[i12] * cse_7[i12];
            cse_9[3 * i12 + i1] = cnst[3 * i12 + i1] * cse_8[i12];
            acc_i12 = acc_i12 + cse_9[3 * i12 + i1];
        }
        A_0[i1] = acc_i12;
    }
}