knl = lp.make_kernel(
        "{ [i, j, k, k1]: 0<=i, j, k, k1<256 }",
        """
        temp_cnst[k] = 2.0 {id=insn_1}
        temp_cnst_2[k1] = 2*temp_cnst[k1] {id=insn_4}
        c[i, j] = reduce(sum, k, a[i,k]*b[k,j]*temp_cnst_2[k]) {id=insn_2}
        c[i, j] = reduce(sum, k1, a[i,k1]*b[k1,j]*temp_cnst_2[k1]) {id=insn_3}
 """)
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | --------------------------------------------------------------------------- | |
| KERNEL: loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel | |
| --------------------------------------------------------------------------- | |
| ARGUMENTS: | |
| A0_global: GlobalArg, type: np_atomic:dtype('float64'), shape: (A0_size), dim_tags: (N0:stride:1) | |
| A0_size: ValueArg, type: np:dtype('int32') | |
| A1_global: GlobalArg, type: np_atomic:dtype('float64'), shape: (A1_size, 2), dim_tags: (N1:stride:2, N0:stride:1) | |
| A1_size: ValueArg, type: np:dtype('int32') | |
| coords_global: GlobalArg, type: np:dtype('float64'), shape: (coords_global_len, 2), dim_tags: (N1:stride:2, N0:stride:1) | |
| coords_global_len: ValueArg, type: np:dtype('int32') | 
static inline void form_cell_integral_otherwise (double  A[18] , const double *const restrict *restrict coords , const double *const restrict *restrict w_0 , const double *const restrict *restrict w_1 , const double *const restrict *restrict w_2 , const double *restrict w_3 , const double *restrict w_4 , const double *restrict w_5 , const double *const restrict *restrict w_6 , const double *const restrict *restrict w_7 , const double *const restrict *restrict w_8 )
{
  double  t26[7] ;
  double  t27[7] ;
  double  t28[7] ;
  double  t29[7] ;
  double  t30[7] ;
  double  t33[3] ;
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | import numpy as np | |
| import loopy as lp | |
| import pyopencl as cl | |
| import pyopencl.array | |
| import pyopencl.clrandom as cl_random | |
| from time import time | |
| ctx = cl.create_some_context() | |
| queue = cl.CommandQueue(ctx) | 
We had our earlier scatter kernel as follows:
coords[iel, i_bf, i_dim] = coords_global[ltg[iel, i_bf], i_dim]And coords being implemented as a 3-dimensional array in the element kernel. This was not a problem.
But the "new" tsfc element kernel, flatten out the 3rd dimension of coords. Making the the coords a 2-dimensional array in the element kernel.
Hence, I would need to change the scatter kernel. So I tried 2 approaches to this problem:
- Making the new scatter kernel look like:
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | --------------------------------------------------------------------------- | |
| KERNEL: loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel | |
| --------------------------------------------------------------------------- | |
| ARGUMENTS: | |
| A0_global: GlobalArg, type: np_atomic:dtype('float64'), shape: (A0_size), dim_tags: (N0:stride:1) | |
| A0_size: ValueArg, type: np:dtype('int32') | |
| coords_global: GlobalArg, type: np:dtype('float64'), shape: (coords_global_len, 2), dim_tags: (N1:stride:2, N0:stride:1) | |
| coords_global_len: ValueArg, type: np:dtype('int32') | |
| ltg_0: GlobalArg, type: np:dtype('int32'), shape: (nelements, 3), dim_tags: (N1:stride:3, N0:stride:1) | |
| ltg_1: GlobalArg, type: np:dtype('int32'), shape: (nelements, 3), dim_tags: (N1:stride:3, N0:stride:1) | 
- (Old) Loopy Kernel:
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) tsfc_kernel(__global double *__restrict__ A_0, __global double const *__restrict__ coords, __global double const *__restrict__ w_0){
  double acc_i12;
  double cse;
  double cse_0;
  cse_0 = -1.0 * coords[1];
  cse = -1.0 * coords[0];