Skip to content

Instantly share code, notes, and snippets.

@kaushikcfd
Last active February 8, 2018 01:54
Show Gist options
  • Save kaushikcfd/4fb7434807be01c9b2d5f4ecb08dd951 to your computer and use it in GitHub Desktop.
Save kaushikcfd/4fb7434807be01c9b2d5f4ecb08dd951 to your computer and use it in GitHub Desktop.
  • (Old) Loopy Kernel:
/*
 * tsfc_kernel (original Loopy output): fills the three entries A_0[0..2].
 *
 * For each i1 in 0..2:
 *   A_0[i1] = sum over i12 in 0..2 of
 *             cnst[3*i12 + i1]
 *             * (cnst[3*i12 + 2]*w_0[2] + cnst[3*i12]*w_0[0] + cnst[3*i12 + 1]*w_0[1])
 *             * cnst_0[i12]
 *             * det
 * where det is the absolute value of a 2x2 determinant formed from
 * differences of coords[0..5].
 *
 * A_0    - output; entries 0..2 are overwritten.
 * coords - input; entries 0..5 are read.
 * w_0    - input; entries 0..2 are read.
 *
 * cnst and cnst_0 are not declared in this block and must be visible at
 * program scope (NOTE(review): presumably __constant tables emitted next to
 * the kernel - confirm).
 */
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) tsfc_kernel(__global double *__restrict__ A_0, __global double const *__restrict__ coords, __global double const *__restrict__ w_0){
  double acc_i12;
  double cse;
  double cse_0;
  double det;

  cse_0 = -1.0 * coords[1];
  cse = -1.0 * coords[0];
  /* Depends only on coords, not on i1 or i12: evaluate once instead of on
     every inner iteration. */
  det = fabs((cse + coords[2]) * (cse_0 + coords[5]) + -1.0 * (cse + coords[4]) * (cse_0 + coords[3]));
  for (int i1 = 0; i1 <= 2; ++i1){
    acc_i12 = 0.0;
    for (int i12 = 0; i12 <= 2; ++i12){
      /* Same left-to-right multiplication order as before, with det as the
         last factor, so the floating-point result is unchanged. */
      acc_i12 = acc_i12 + cnst[3 * i12 + i1] * (cnst[3 * i12 + 2] * w_0[2] + cnst[3 * i12] * w_0[0] + cnst[3 * i12 + 1] * w_0[1]) * cnst_0[i12] * det;
    }
    A_0[i1] = acc_i12;
  }
}
  • MatFree kernel:
/*
 * Accumulates a 3x3 contribution into A:
 *
 *   A[j][k] += sum over ip in 0..2 of  tab[ip][j] * tab[ip][k] * wts[ip] * det
 *
 * where det is the absolute value of the 2x2 determinant built from the
 * differences (coords[1] - coords[0]) and (coords[2] - coords[0]).
 *
 * A      - 3x3 array updated in place with +=; it is not cleared here, so the
 *          caller decides the starting values.
 * coords - three pointers, each to at least two doubles; only [i][0] and
 *          [i][1] for i in 0..2 are read.
 */
static inline void form00_cell_integral_otherwise (double  A[3][3] , const double *const restrict *restrict coords ){
  /* Both tables are declared one element wider than the loops ever index;
     the extra trailing entry is zero-initialised and never read. */
  static const double tab[3][4] = {
    {0.666666666666667, 0.166666666666667, 0.166666666666667},
    {0.166666666666667, 0.166666666666667, 0.666666666666667},
    {0.166666666666667, 0.666666666666667, 0.166666666666667}
  };
  static const double wts[4] = {0.166666666666667, 0.166666666666667, 0.166666666666667};

  const double x0 = coords[0][0];
  const double y0 = coords[0][1];
  const double det = fabs((coords[1][0] - x0) * (coords[2][1] - y0)
                          - (coords[2][0] - x0) * (coords[1][1] - y0));

  for (int ip = 0; ip < 3; ++ip) {
    const double *row = tab[ip];
    const double scale = wts[ip] * det;
    for (int j = 0; j < 3; ++j) {
      const double left = row[j] * scale;
      for (int k = 0; k < 3; ++k) {
        #pragma coffee expression
        A[j][k] += row[k] * left;
      }
    }
  }
}
  • New Loopy Kernel after taking into account the extra CSEs (common subexpressions) that we were previously dropping:
/*
 * tsfc_kernel (Loopy output with the common subexpressions kept as named
 * temporaries): fills the three entries A_0[0..2].
 *
 * For each i1 in 0..2:
 *   A_0[i1] = sum over i12 in 0..2 of  cnst[3*i12 + i1] * cse_8[i12]
 * with
 *   cse_8[i12] = (cnst_0[i12] * cse_1)
 *                * ((cnst[3*i12]*w_0[0] + cnst[3*i12 + 1]*w_0[1]) + cnst[3*i12 + 2]*w_0[2])
 * and cse_1 the absolute value of a 2x2 determinant formed from differences
 * of coords[0..5].
 *
 * A_0    - output; entries 0..2 are overwritten.
 * coords - input; entries 0..5 are read.
 * w_0    - input; entries 0..2 are read.
 *
 * cnst and cnst_0 are not declared in this block and must be visible at
 * program scope (NOTE(review): presumably __constant tables emitted next to
 * the kernel - confirm).
 */
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) tsfc_kernel(__global double *__restrict__ A_0, __global double const *__restrict__ coords, __global double const *__restrict__ w_0)
{
  double acc_i12;
  double cse;
  double cse_0;
  double cse_1;
  double cse_2[3];
  double cse_3[3];
  double cse_4[3];
  double cse_5[3];
  double cse_6[3];
  double cse_7[3];
  double cse_8[3];
  double cse_9[3 * 3];

  cse_0 = -1.0 * coords[1];
  cse = -1.0 * coords[0];
  cse_1 = fabs((cse + coords[2]) * (cse_0 + coords[5]) + -1.0 * (cse + coords[4]) * (cse_0 + coords[3]));

  /* These temporaries depend only on i12, so they are filled once here
     rather than being recomputed for every i1. Each use is subscripted:
     the array names on their own would be pointer operands, which the
     arithmetic operators reject. */
  for (int i12 = 0; i12 <= 2; ++i12)
  {
    cse_6[i12] = cnst[3 * i12 + 2] * w_0[2];
    cse_4[i12] = cnst[3 * i12 + 1] * w_0[1];
    cse_3[i12] = cnst[3 * i12] * w_0[0];
    cse_5[i12] = cse_3[i12] + cse_4[i12];
    cse_7[i12] = cse_5[i12] + cse_6[i12];
    cse_2[i12] = cnst_0[i12] * cse_1;
    cse_8[i12] = cse_2[i12] * cse_7[i12];
  }

  for (int i1 = 0; i1 <= 2; ++i1)
  {
    acc_i12 = 0.0;
    for (int i12 = 0; i12 <= 2; ++i12)
    {
      cse_9[3 * i12 + i1] = cnst[3 * i12 + i1] * cse_8[i12];
      acc_i12 = acc_i12 + cse_9[3 * i12 + i1];
    }
    A_0[i1] = acc_i12;
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment