Last active: August 23, 2016 15:15
Setting to use device 0
Build Status = -2 ( Err = -11 )
Log: (1387:0) : error : atomic function "atomic_cmpxchg" not supported
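The status pair matches CL_BUILD_ERROR (-2) and CL_BUILD_PROGRAM_FAILURE (-11): the device compiler rejects atomic_cmpxchg, which has been a core built-in for 32-bit integers since OpenCL 1.1, so the device or driver is presumably reporting an older OpenCL version. A minimal host-side sketch for checking what the device claims before building — identifiers here are illustrative, not taken from this gist:

/* Print the OpenCL version and extension string a device reports. */
#include <CL/cl.h>
#include <stdio.h>

static void print_device_caps(cl_device_id dev) {
  char buf[4096];
  /* atomic_cmpxchg requires the device to report OpenCL 1.1 or later. */
  clGetDeviceInfo(dev, CL_DEVICE_VERSION, sizeof(buf), buf, NULL);
  printf("CL_DEVICE_VERSION:    %s\n", buf);
  /* The source below also keys off cl_khr_fp64 and cl_khr_int64_base_atomics. */
  clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, sizeof(buf), buf, NULL);
  printf("CL_DEVICE_EXTENSIONS: %s\n", buf);
}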
Sources:
#ifndef __OPENCL_VERSION__
#define __kernel
#define __global
#define __constant
#define __local
#define get_global_id(x) 0
#define get_global_size(x) 0
#define get_local_id(x) 0
#define get_local_size(x) 0
#define FLT_MAX 0
#define FLT_MIN 0
#define cl_khr_fp64
#define cl_amd_fp64
#define DOUBLE_SUPPORT_AVAILABLE
#define CLK_LOCAL_MEM_FENCE
#define CLK_GLOBAL_MEM_FENCE
#define Dtype float
#define barrier(x)
#define atomic_cmpxchg(x, y, z) x
#define signbit(x) x
#define int_tp long
#define uint_tp unsigned long
#define int_tpc long
#define uint_tpc unsigned long
#endif
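// Note: the #ifndef __OPENCL_VERSION__ block above appears to be a host-side
// stub: when this file is parsed by a plain C compiler (where
// __OPENCL_VERSION__ is undefined), the kernel qualifiers and builtins
// collapse to no-ops so the source can be syntax-checked outside the driver.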
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define TYPE_FLOAT 1
#define TYPE_DOUBLE 2
#if defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define DOUBLE_SUPPORT_AVAILABLE
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#define DOUBLE_SUPPORT_AVAILABLE
#endif
#if defined(cl_khr_int64_base_atomics)
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#define ATOMICS_64_AVAILABLE
#endif
// Types used for parameters, offset computations and so on
#define int_tp int
#define uint_tp unsigned int
// Definitions used to cast the types above as needed
#define int_tpc int
#define uint_tpc unsigned int
#define Dtype float
#define Dtype2 float2
#define Dtype4 float4
#define Dtype8 float8
#define Dtype16 float16
#define TYPE TYPE_FLOAT
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
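// For reference: TEMPLATE(name,Dtype) pastes through CONCAT, and because the
// arguments are macro-expanded before the ## paste, TEMPLATE(relu_forward,Dtype)
// becomes the symbol relu_forward_float here; rebuilding the same source with
// Dtype defined as double would produce the _double variants instead.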
__kernel void TEMPLATE(relu_forward,Dtype)(const int_tp n,
                                           __global const Dtype* in,
                                           __global Dtype* out,
                                           Dtype negative_slope) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope;
  }
}
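// (The loop above is a grid-stride loop: each work-item starts at its global
// id and advances by the total work-item count, so any launch size covers all
// n elements. A plausible host-side launch, names illustrative only:
//   size_t global = 16384;  /* any multiple of the work-group size */
//   clEnqueueNDRangeKernel(queue, relu_forward_float, 1, NULL, &global,
//                          NULL, 0, NULL, NULL);
// The same pattern is used by nearly every kernel in this file.)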
__kernel void TEMPLATE(relu_backward,Dtype)(const int_tp n,
                                            __global const Dtype* in_diff,
                                            __global const Dtype* in_data,
                                            __global Dtype* out_diff,
                                            Dtype negative_slope) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out_diff[index] = in_diff[index]
        * ((in_data[index] > 0 ? 1.0 : 0.0)
           + (in_data[index] <= 0 ? 1.0 : 0.0) * negative_slope);
  }
}
__kernel void TEMPLATE(tanh_forward,Dtype)(const int_tp n,
                                           __global const Dtype* in,
                                           __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out[index] = tanh(in[index]);
  }
}
__kernel void TEMPLATE(tanh_backward,Dtype)(const int_tp n,
                                            __global const Dtype* in_diff,
                                            __global const Dtype* out_data,
                                            __global Dtype* out_diff) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    Dtype tanhx = out_data[index];
    out_diff[index] = in_diff[index] * (1 - tanhx * tanhx);
  }
}
__kernel void TEMPLATE(sigmoid_forward,Dtype)(const int_tp n,
                                              __global const Dtype* in,
                                              __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out[index] = 1.0 / (1.0 + exp(-in[index]));
  }
}
__kernel void TEMPLATE(sigmoid_backward,Dtype)(const int_tp n,
                                               __global const Dtype* in_diff,
                                               __global const Dtype* out_data,
                                               __global Dtype* out_diff) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    const Dtype sigmoid_x = out_data[index];
    out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x);
  }
}
__kernel void TEMPLATE(threshold,Dtype)(const int_tp n, const Dtype threshold,
                                        __global const Dtype* in,
                                        __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out[index] = in[index] > threshold ? 1.0 : 0.0;
  }
}
__kernel void TEMPLATE(prelu_forward,Dtype)(const int_tp n, const int_tp channels,
                                            const int_tp dim,
                                            __global const Dtype* in,
                                            __global Dtype* out,
                                            __global const Dtype* slope_data,
                                            const int_tp div_factor) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    int_tp c = (index / dim) % channels / div_factor;
    out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
  }
}
__kernel void TEMPLATE(prelu_backward,Dtype)(const int_tp n, const int_tp channels,
                                             const int_tp dim,
                                             __global const Dtype* in_diff,
                                             __global const Dtype* in_data,
                                             __global Dtype* out_diff,
                                             __global const Dtype* slope_data,
                                             const int_tp div_factor) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    int_tp c = (index / dim) % channels / div_factor;
    out_diff[index] = in_diff[index]
        * ((in_data[index] > 0 ? 1.0 : 0.0)
           + (in_data[index] <= 0 ? 1.0 : 0.0) * slope_data[c]);
  }
}
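// (In the PReLU kernels, c = (index / dim) % channels / div_factor selects
// the slope: with div_factor == 1 every channel has its own slope, while
// div_factor == channels maps all channels to slope_data[0] — matching the
// channel_shared option of Caffe's PReLU layer, which these kernels appear
// to mirror.)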
__kernel void TEMPLATE(prelu_param_backward,Dtype)(const int_tp n, const int_tp rows,
                                                   const int_tp rowPitch,
                                                   __global const Dtype* in_diff,
                                                   __global const Dtype* in_data,
                                                   __global Dtype* out_diff) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0 ? 1.0 : 0.0);
    for (int k = 1; k < rows; k++) {
      out_diff[index] += in_diff[index + k * rowPitch]
          * in_data[index + k * rowPitch]
          * (in_data[index + k * rowPitch] <= 0 ? 1.0 : 0.0);
    }
  }
}
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(gpu_set,Dtype)(const int_tp n, const Dtype alpha, __global Dtype* y) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index] = alpha; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(br_forward,Dtype)(const int_tp count, const int_tp inner_dim, | |
__global const Dtype* in, | |
__global const Dtype* permut, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
int_tp n = index / (inner_dim); | |
int_tp in_n = (int_tp) (permut[n]); | |
out[index] = in[in_n * (inner_dim) + index % (inner_dim)]; | |
} | |
} | |
__kernel void TEMPLATE(br_backward,Dtype)(const int_tp count, const int_tp inner_dim, | |
__global const Dtype* in, | |
__global const Dtype* top_indexes, | |
__global const Dtype* begins, | |
__global const Dtype* counts, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
int_tp n = index / (inner_dim); | |
out[index] = 0; | |
int_tp lower = (int_tp) (begins[n]); | |
int_tp upper = lower + (int_tp) (counts[n]); | |
for (int_tp i = lower; i < upper; ++i) { | |
int_tp in_n = (int_tp) (top_indexes[i]); | |
out[index] += in[in_n * (inner_dim) + index % (inner_dim)]; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(null_kernel,Dtype)(Dtype arg) { | |
Dtype out = arg; | |
} | |
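// (null_kernel — like the *_phony kernels later in this file — presumably
// exists only so the compiled program always contains at least one kernel,
// even when every real kernel in a section is compiled out by #ifdef guards.)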
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(bias_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global const Dtype* bias, | |
const int_tp bias_dim, | |
const int_tp inner_dim, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
const int_tp bias_index = (index / inner_dim) % bias_dim; | |
out[index] = in[index] + bias[bias_index]; | |
} | |
} | |
__kernel void TEMPLATE(scale_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global const Dtype* scale, | |
const int_tp scale_dim, | |
const int_tp inner_dim, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
const int_tp scale_index = (index / inner_dim) % scale_dim; | |
out[index] = in[index] * scale[scale_index]; | |
} | |
} | |
__kernel void TEMPLATE(scale_bias_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global const Dtype* scale, | |
__global const Dtype* bias, | |
const int_tp scale_dim, | |
const int_tp inner_dim, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
const int_tp scale_index = (index / inner_dim) % scale_dim; | |
out[index] = in[index] * scale[scale_index] + bias[scale_index]; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(bnll_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
if (in[index] > 0.0f) { | |
out[index] = in[index] + log((Dtype) (1.0 + exp(-in[index]))); | |
} else { | |
out[index] = log((Dtype) (1.0 + exp(in[index]))); | |
} | |
} | |
} | |
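// (The branch above is the usual overflow-safe evaluation of
// BNLL(x) = log(1 + exp(x)): for x > 0 it is rewritten as
// x + log(1 + exp(-x)), so exp() only ever sees a non-positive argument
// and cannot overflow.)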
__kernel void TEMPLATE(bnll_backward,Dtype)(const int_tp n, | |
__global const Dtype* in_diff, | |
__global const Dtype* in_data, | |
__global Dtype* out_diff) { | |
Dtype kBNLL_THRESHOLD = 50.; | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
Dtype expval = exp(min(in_data[index], kBNLL_THRESHOLD)); | |
out_diff[index] = in_diff[index] * expval / (expval + 1.); | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(kernel_channel_max,Dtype)(const int_tp num, const int_tp channels, | |
const int_tp spatial_dim, | |
__global const Dtype* data, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < num * spatial_dim; index += | |
get_global_size(0)) { | |
int_tp n = index / spatial_dim; | |
int_tp s = index % spatial_dim; | |
float maxval = -FLT_MAX; | |
for (int_tp c = 0; c < channels; ++c) { | |
maxval = max((Dtype)(data[(n * channels + c) * spatial_dim + s]), (Dtype)maxval); | |
} | |
out[index] = maxval; | |
} | |
} | |
__kernel void TEMPLATE(kernel_channel_subtract,Dtype)(const int_tp count, const int_tp num, | |
const int_tp channels, | |
const int_tp spatial_dim, | |
__global const Dtype* channel_max, | |
__global Dtype* data) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
int_tp n = index / channels / spatial_dim; | |
int_tp s = index % spatial_dim; | |
data[index] -= channel_max[n * spatial_dim + s]; | |
} | |
} | |
__kernel void TEMPLATE(kernel_exp,Dtype)(const int_tp count, __global const Dtype* data, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
out[index] = exp(data[index]); | |
} | |
} | |
__kernel void TEMPLATE(kernel_channel_sum,Dtype)(const int_tp num, const int_tp channels, | |
const int_tp spatial_dim, | |
__global const Dtype* data, | |
__global Dtype* channel_sum) { | |
for (int_tp index = get_global_id(0); index < num * spatial_dim; index += | |
get_global_size(0)) { | |
int_tp n = index / spatial_dim; | |
int_tp s = index % spatial_dim; | |
Dtype sum = 0; | |
for (int_tp c = 0; c < channels; ++c) { | |
sum += data[(n * channels + c) * spatial_dim + s]; | |
} | |
channel_sum[index] = sum; | |
} | |
} | |
__kernel void TEMPLATE(kernel_channel_div,Dtype)(const int_tp count, const int_tp num, | |
const int_tp channels, const int_tp spatial_dim, | |
__global const Dtype* channel_sum, | |
__global Dtype* data) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
int_tp n = index / channels / spatial_dim; | |
int_tp s = index % spatial_dim; | |
data[index] /= channel_sum[n * spatial_dim + s]; | |
} | |
} | |
__kernel void TEMPLATE(kernel_channel_dot,Dtype)(const int_tp num, const int_tp channels, | |
const int_tp spatial_dim, | |
__global const Dtype* data_1, | |
__global const Dtype* data_2, | |
__global Dtype* channel_dot) { | |
for (int_tp index = get_global_id(0); index < num * spatial_dim; index += | |
get_global_size(0)) { | |
int_tp n = index / spatial_dim; | |
int_tp s = index % spatial_dim; | |
Dtype dot = 0; | |
for (int_tp c = 0; c < channels; ++c) { | |
dot += (data_1[(n * channels + c) * spatial_dim + s] | |
* data_2[(n * channels + c) * spatial_dim + s]); | |
} | |
channel_dot[index] = dot; | |
} | |
} | |
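// (Taken together, the kernel_channel_* kernels above implement a
// channel-wise softmax: channel_max and channel_subtract shift each spatial
// position by its per-channel maximum for numerical stability, kernel_exp
// exponentiates, channel_sum and channel_div normalize; kernel_channel_dot
// computes the per-position dot product needed by the softmax backward pass.)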
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(concat,Dtype)(const int_tp nthreads, __global const Dtype* in_data, | |
const int forward, const int_tp num_concats, | |
const int_tp concat_size, | |
const int_tp top_concat_axis, | |
const int_tp bottom_concat_axis, | |
const int_tp offset_concat_axis, | |
__global Dtype* out_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp total_concat_size = concat_size * bottom_concat_axis; | |
const int_tp concat_num = index / total_concat_size; | |
const int_tp concat_index = index % total_concat_size; | |
const int_tp top_index = concat_index | |
+ (concat_num * top_concat_axis + offset_concat_axis) * concat_size; | |
if (forward == 1) { | |
out_data[top_index] = in_data[index]; | |
} else { | |
out_data[index] = in_data[top_index]; | |
} | |
} | |
} | |
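// (Index math for concat: each index splits into concat_num (which of the
// num_concats slices) and concat_index (position within the bottom blob's
// slice); top_index re-bases that slice at offset_concat_axis along the top
// blob's concatenation axis. With forward == 1 the kernel scatters bottom
// into top; otherwise it gathers top back into bottom for the backward pass.)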
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(cll_backward,Dtype)(const int_tp count, const int_tp channels, | |
const Dtype margin, const Dtype alpha, __global const Dtype* y, | |
__global const Dtype* diff, __global const Dtype* dist_sq, | |
__global Dtype *bottom_diff) { | |
for (int_tp i = get_global_id(0); i < count; | |
i += get_global_size(0)) { | |
int_tp n = i / channels; // the num index, to access y and dist_sq | |
if (trunc(y[n]) != 0.) { // similar pairs | |
bottom_diff[i] = alpha * diff[i]; | |
} else { // dissimilar pairs | |
Dtype mdist = 0.; | |
Dtype beta = 0.; | |
Dtype dist = sqrt(dist_sq[n]); | |
mdist = (margin - dist); | |
beta = -alpha * mdist / (dist + 1e-4) * diff[i]; | |
if (mdist > 0.) { | |
bottom_diff[i] = beta; | |
} else { | |
bottom_diff[i] = 0; | |
} | |
} | |
} | |
} | |
__kernel void TEMPLATE(cll_backward_legacy,Dtype)(const int count, const int channels, | |
const Dtype margin, const Dtype alpha, __global Dtype* y, | |
__global Dtype* diff, __global Dtype* dist_sq, | |
__global Dtype* bottom_diff) { | |
for (int_tp i = get_global_id(0); i < count; | |
i += get_global_size(0)) { | |
int n = i / channels; // the num index, to access y and dist_sq | |
if (trunc(y[n]) != 0.) { // similar pairs | |
bottom_diff[i] = alpha * diff[i]; | |
} else { // dissimilar pairs | |
Dtype mdist = 0.; | |
Dtype beta = 0.; | |
mdist = (margin - dist_sq[n]); | |
beta = -alpha; | |
if (mdist > 0.) { | |
bottom_diff[i] = beta; | |
} else { | |
bottom_diff[i] = 0; | |
} | |
} | |
} | |
} | |
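// (Both contrastive-loss backward kernels use gradient alpha * diff for
// similar pairs; for dissimilar pairs the first applies
// -alpha * (margin - d) / d * diff with d = sqrt(dist_sq) while the margin
// is violated, whereas the _legacy variant applies the margin to the squared
// distance and uses the constant -alpha, matching the older formulation of
// the loss.)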
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(conv_layer_spatial_phony,Dtype)(Dtype arg) { | |
Dtype out = arg; | |
} | |
#define __CAT(x, y) x##y | |
#define CAT(x, y) __CAT(x, y) | |
#define LOOP1(VAR, STMT) (STMT); (VAR)++; | |
#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT)) | |
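// (LOOP(N, VAR, STMT) force-unrolls STMT N times via the CAT token paste;
// for example LOOP(3, i, body) expands to
//   (body); (i)++; (body); (i)++; (body); (i)++;
// so STMT must be a single statement that indexes with VAR, and VAR ends up
// incremented N times.)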
#ifdef MULTI | |
__kernel void CFMulti(__global Dtype* image_data, int_tp image_offset, | |
__global Dtype* kernel_data, int_tp kernel_offset, | |
__global Dtype* bias,const int_tp bias_offset, | |
__global Dtype* convolved_image,const int_tp convolved_image_offset, | |
const ushort WIDTH, | |
const ushort HEIGHT, | |
const ushort OUTPUT_W, | |
const ushort OUTPUT_H) { | |
const int_tp outputX = get_global_id(0); | |
const int_tp outputY = get_global_id(1); | |
const int_tp kernelNum = get_global_id(2)*ZPAR; | |
if(outputX < OUTPUT_W && outputY < OUTPUT_H) | |
{ | |
Dtype sum[ZPAR];
Dtype4 vectorSum[ZPAR];
for(int_tp kern = 0; kern < ZPAR; kern++)
{
sum[kern] = 0.0f;
vectorSum[kern] = (Dtype4)(0.0f, 0.0f, 0.0f, 0.0f);
}
const int_tp currentKernelOffset = kernel_offset + kernelNum*KERNEL_H*KERNEL_W*CHANNELS; | |
const int_tp biasIndex=bias_offset + kernelNum; | |
const int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W; | |
const int_tp imageSize = WIDTH*HEIGHT; | |
const int_tp float4Reads = KERNEL_W / 4; | |
const int_tp floatReads = KERNEL_W % 4; | |
Dtype4 imageCache; | |
__global Dtype* image_dataPtrFloat = (image_data + (image_offset + local_image_offset)); | |
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset)); | |
for(int_tp c = 0; c < CHANNELS; c++) | |
{ | |
for(int_tp y = 0; y < KERNEL_H; y++) | |
{ | |
for(int_tp x=0; x< float4Reads; x++) | |
{ | |
imageCache = ((__global Dtype4*)image_dataPtrFloat)[x]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
{ | |
vectorSum[kern] += imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[x]; | |
} | |
} | |
if(floatReads == 1) | |
{ | |
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
vectorSum[kern].s0 += ( imageCache * ( (__global Dtype4*) &(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]) )[float4Reads] ).s0; | |
} | |
else if(floatReads == 2) | |
{ | |
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
vectorSum[kern].s01 += (imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[float4Reads]).s01; | |
} | |
else if(floatReads == 3) | |
{ | |
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
vectorSum[kern].s012 += (imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[float4Reads]).s012; | |
} | |
image_dataPtrFloat += WIDTH; | |
kernel_dataPtrFloat += KERNEL_W; | |
} | |
image_dataPtrFloat += imageSize - WIDTH*KERNEL_H; | |
} | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
sum[kern] = vectorSum[kern].x + vectorSum[kern].y + vectorSum[kern].z + vectorSum[kern].w; | |
if(APPLY_BIAS == 1) | |
{ | |
for(int_tp kern = 0; kern < ZPAR; kern++) | |
if(kernelNum+kern < OUTPUT_Z) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX] = | |
sum[kern] + bias[biasIndex +kern]; | |
} | |
else | |
for(int_tp kern = 0; kern < ZPAR; kern++) | |
if(kernelNum+kern < OUTPUT_Z) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX] = sum[kern]; | |
} | |
} | |
#endif | |
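// (The CFMulti kernel above vectorizes each filter row as KERNEL_W / 4 full
// Dtype4 multiply-accumulates plus a 1-3 element tail handled through the
// .s0 / .s01 / .s012 swizzles, accumulating into vectorSum before the final
// horizontal add into sum[kern].)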
#ifdef MULTI_11 | |
__kernel void CFMulti_11_11_4(__global Dtype* image_data, int_tp image_offset, | |
__global Dtype* kernel_data, int_tp kernel_offset, | |
__global Dtype* bias,const int_tp bias_offset, | |
__global Dtype* convolved_image,const int_tp convolved_image_offset, | |
const ushort WIDTH, | |
const ushort HEIGHT, | |
const ushort OUTPUT_W, | |
const ushort OUTPUT_H) { | |
int_tp outputX = get_global_id(0)*XPAR; | |
int_tp outputY = get_global_id(1)*YPAR; | |
int_tp kernelNum = get_global_id(2)*ZPAR; | |
if(outputX < OUTPUT_W && outputY < OUTPUT_H) | |
{ | |
Dtype sum[XPAR*YPAR*ZPAR]; | |
for(int_tp kern =0; kern < XPAR*YPAR*ZPAR; kern++) | |
{ | |
sum[kern] = 0.0f; | |
} | |
int_tp currentKernelOffset = kernel_offset + kernelNum*KERNELSIZE*CHANNELS; | |
int_tp biasIndex=bias_offset + kernelNum; | |
int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W; | |
int_tp imageSize = WIDTH*HEIGHT; | |
int_tp index; | |
__global Dtype* image_dataPtrFloat = (image_data + (image_offset + local_image_offset)); | |
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset)); | |
Dtype16 imageCache; | |
Dtype8 imageCacheR; | |
Dtype8 kernelCache; | |
Dtype4 kernelCacheR; | |
for(int_tp c = 0; c < CHANNELS; c++) | |
{ | |
for(int_tp y = 0; y < 11; y++) | |
{ | |
imageCache = ((__global Dtype16*)image_dataPtrFloat)[0]; | |
imageCacheR =((__global Dtype8*)image_dataPtrFloat)[2]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
{ | |
kernelCache = ((__global Dtype8*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[0]; | |
kernelCacheR = ((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[2]; | |
index = kern*XPAR; | |
sum[index + 0] += dot(imageCache.S0123,kernelCache.S0123); | |
sum[index + 1] += dot(imageCache.S4567,kernelCache.S0123); | |
sum[index + 2] += dot(imageCache.S89AB,kernelCache.S0123); | |
sum[index + 3] += dot(imageCache.SCDEF,kernelCache.S0123); | |
sum[index + 0] += dot(imageCache.S4567,kernelCache.S4567); | |
sum[index + 1] += dot(imageCache.S89AB,kernelCache.S4567); | |
sum[index + 2] += dot(imageCache.SCDEF,kernelCache.S4567); | |
sum[index + 3] += dot(imageCacheR.S0123,kernelCache.S4567); | |
sum[index + 0] += dot(imageCache.S89A,kernelCacheR.S012); | |
sum[index + 1] += dot(imageCache.SCDE,kernelCacheR.S012); | |
sum[index + 2] += dot(imageCacheR.S012,kernelCacheR.S012); | |
sum[index + 3] += dot(imageCacheR.S456,kernelCacheR.S012); | |
} | |
image_dataPtrFloat += WIDTH; | |
kernel_dataPtrFloat += KERNEL_W; | |
} | |
image_dataPtrFloat += imageSize - WIDTH*KERNEL_H; | |
} | |
if(APPLY_BIAS == 1) | |
{ | |
for(int_tp kern = 0; kern < ZPAR; kern++) | |
{ | |
for(int_tp wi =0; wi < XPAR; wi++) | |
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX + wi] = | |
sum[kern*XPAR + wi] + bias[biasIndex +kern]; | |
} | |
} | |
else | |
for(int_tp kern = 0; kern < ZPAR; kern++) | |
for(int_tp wi =0; wi < XPAR; wi++) | |
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX + wi] = sum[kern*XPAR + wi]; | |
} | |
} | |
#endif | |
#ifdef MULTI_GEN | |
__kernel void CFMulti_6(__global const Dtype* restrict image_data, const int_tp image_offset, | |
__global const Dtype* restrict kernel_data, const int_tp kernel_offset, | |
__global const Dtype* restrict bias,const int_tp bias_offset, | |
__global Dtype* restrict convolved_image,const int_tp convolved_image_offset, | |
const ushort WIDTH, | |
const ushort HEIGHT, | |
const ushort OUTPUT_W, | |
const ushort OUTPUT_H) { | |
const int_tp outputX = get_global_id(0)*XPAR; | |
const int_tp outputY = get_global_id(1)*YPAR; | |
const int_tp kernelNum = get_global_id(2)*ZPAR; | |
if(outputX < OUTPUT_W && outputY < OUTPUT_H) | |
{ | |
Dtype sum[XPAR*YPAR*ZPAR]; | |
for(uint_tp kern = 0; kern < XPAR*YPAR*ZPAR; kern++) | |
sum[kern] = 0.0f; | |
const int_tp currentKernelOffset = kernel_offset + kernelNum*KERNELSIZE*CHANNELS; | |
const int_tp biasIndex=bias_offset + kernelNum; | |
const int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W; | |
const int_tp imageSize = WIDTH*HEIGHT; | |
int_tp index; | |
__global const Dtype* image_dataPtrFloat[2]; | |
image_dataPtrFloat[0] = (image_data + (image_offset + local_image_offset)); | |
image_dataPtrFloat[1] = image_dataPtrFloat[0]; | |
__global const Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset)); | |
DTImage imageCache[YPAR]; | |
DTKernel kernelCache; | |
Dtype4 temp; | |
for(uint_tp c = 0; c < CHANNELS; c++) | |
{ | |
imageCache[0] = ((__global DTImage*)image_dataPtrFloat[1])[0]; | |
for(uint_tp preload = 1; preload < YPAR; preload++) | |
{ | |
image_dataPtrFloat[1] += WIDTH; | |
imageCache[preload] = ((__global DTImage*)image_dataPtrFloat[1])[0]; | |
} | |
int_tp y =0; | |
LOOP(KERNEL_H, y, | |
{ | |
int_tp kern=0; | |
LOOP(ZPAR, kern, | |
{ | |
kernelCache = ((__global DTKernel*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[0]; | |
index = kern*XPAR*YPAR; | |
for(uint_tp y_par = 0; y_par < YPAR; y_par++) | |
{ | |
temp = floatDotV4(imageCache[y_par],kernelCache); | |
sum[index + y_par*XPAR + 0] += temp.s0; | |
sum[index + y_par*XPAR + 1] += temp.s1; | |
sum[index + y_par*XPAR + 2] += temp.s2; | |
sum[index + y_par*XPAR + 3] += temp.s3; | |
} | |
}); | |
kernel_dataPtrFloat += KERNEL_W; | |
for(uint_tp rotateData = 0; rotateData < YPAR - 1; rotateData++) | |
imageCache[rotateData] = imageCache[rotateData + 1]; | |
image_dataPtrFloat[1] += WIDTH; | |
imageCache[YPAR - 1] = ((__global DTImage*)image_dataPtrFloat[1])[0]; | |
}); | |
image_dataPtrFloat[0] += imageSize; | |
image_dataPtrFloat[1] = image_dataPtrFloat[0]; | |
} | |
if(APPLY_BIAS == 1) | |
{ | |
for(uint_tp kern = 0; kern < ZPAR; kern++) | |
{ | |
for(uint_tp hi =0; hi < YPAR; hi++) | |
for(uint_tp wi =0; wi < XPAR; wi++) | |
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W && outputY + hi < OUTPUT_H) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + (outputY +hi)*OUTPUT_W + outputX + wi] = | |
sum[kern*XPAR*YPAR + XPAR*hi + wi] + bias[biasIndex +kern]; | |
} | |
} | |
else | |
for(uint_tp kern = 0; kern < ZPAR; kern++) | |
for(uint_tp hi =0; hi < YPAR; hi++) | |
for(uint_tp wi =0; wi < XPAR; wi++) | |
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W && outputY + hi < OUTPUT_H) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + (outputY + hi)*OUTPUT_W + outputX + wi] = sum[kern*XPAR*YPAR +XPAR*hi +wi]; | |
} | |
} | |
#endif | |
//Begin IDLF kernels below here | |
#ifdef IDLF | |
#define activation_function(x) (x) | |
#if 0 | |
#define _IW INPUT_WIDTH | |
#define _IH INPUT_HEIGHT | |
#define _OW OUTPUT_WIDTH | |
#define _OH OUTPUT_HEIGHT | |
#endif | |
#define _ID INPUT_DEPTH | |
#define _OD NUM_FILTERS | |
#define FILTER_DEPTH INPUT_DEPTH | |
#define NUM_INPUT INPUT_DEPTH | |
#define NUM_OUTPUT NUM_FILTERS | |
#define KERNEL FILTER_WIDTH | |
// convolution stride, same for x and y | |
#define K_STRIDE STRIDEX | |
#ifndef IWPAD | |
#define IWPAD 0 | |
#endif | |
#ifndef IHPAD | |
#define IHPAD 0 | |
#endif | |
#define OUT_BLOCK_SIZE (OUT_BLOCK_WIDTH*OUT_BLOCK_HEIGHT) | |
#ifndef MASTER_OUT_BLOCK_WIDTH | |
#define MASTER_OUT_BLOCK_WIDTH OUT_BLOCK_WIDTH | |
#endif | |
#ifndef MASTER_OUT_BLOCK_HEIGHT | |
#define MASTER_OUT_BLOCK_HEIGHT OUT_BLOCK_HEIGHT | |
#endif | |
// Each work-item computes a 4x6 region of one output map.
// Each work-group (which will be mapped to 1 SIMD16 EU thread) will compute 16 different feature maps, but each feature map is for the same 4x6 region of the input image.
// NDRange: (_OW+pad)/ OUT_BLOCK_WIDTH, (_OH+pad)/OUT_BLOCK_HEIGHT, _OD/OUT_BLOCK_DEPTH
//#define SIMD_SIZE 16
// NOTE: reqd_work_group_size does not guarantee that SIMD16 mode will be used; the compiler could choose to use two SIMD8 threads instead, and if that happens the code will break.
#ifdef SIMD16 | |
#define TILE_X ((OUT_BLOCK_WIDTH - 1) * STRIDEX + KERNEL) | |
#define TILE_Y ((OUT_BLOCK_HEIGHT - 1) * STRIDEY + KERNEL) | |
#if (TILE_X % 4) != 0 | |
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE))) | |
kernel void | |
convolve_simd16( // __global float *inputs, __global float* weights, __global float* outputs | |
__global float* inputs_base, | |
filter_qualifier float* weights_base, | |
__global float* biases_base, | |
__global float* outputs_base, | |
const ushort _IW, | |
const ushort _IH, | |
const ushort _OW, | |
const ushort _OH) | |
{ | |
__global float* outputs = outputs_base; | |
__global float* inputs = inputs_base; | |
filter_qualifier float* weights = weights_base; | |
__global float* biases = biases_base; | |
uint_tp oc = get_global_id(0) * MASTER_OUT_BLOCK_WIDTH; // oc = Output Column | |
uint_tp or = get_global_id(1) * MASTER_OUT_BLOCK_HEIGHT;// or = Output Row | |
uint_tp fm = get_global_id(2);// fm = Feature Map = od = Output Depth | |
uint_tp fmg = get_group_id(2); | |
uint_tp lid = get_local_id(2); | |
float in[IN_BUFFER_SIZE];// load 11x16 block of input data, really only need 11x15 for 4x6 outputs, but keep it simple. | |
//float out[24]; // 4x6 block of outputs that is SIMD_SIZE deep (along the Feature Map dimension). | |
float out[OUT_BLOCK_SIZE]; | |
uint_tp in_addr; | |
// find weights address of the given neuron (lid is the index)
uint_tp weight_addr = (fmg % (_OD/SIMD_SIZE)) * INPUT_DEPTH * KERNEL * KERNEL * SIMD_SIZE + lid; | |
for(int_tp i=0;i<OUT_BLOCK_SIZE;i++) { | |
out[i]=0.0f; | |
} | |
uint_tp num_in_batch = fm / _OD; | |
uint_tp input_batch_offset = num_in_batch * (_IH + IHPAD) * (_IW + IWPAD) * TOTAL_INPUT_DEPTH_SIZE; | |
for(int_tp kd = 0; kd < _ID; kd++) | |
{ | |
in_addr = input_batch_offset + (kd + INPUT_START_Z) * (_IH + IHPAD) * (_IW + IWPAD) + (or*K_STRIDE + INPUT_START_Y) * (_IW + IWPAD) + (oc*K_STRIDE + INPUT_START_X) + lid; | |
// read 11x16 input block into registers. | |
for(uint_tp reg = 0; reg < IN_BUFFER_SIZE; reg++) { | |
in[reg] = inputs[in_addr]; // read 16 elements | |
in_addr += (_IW + IWPAD);// move to next row down | |
} | |
// WEIGHT_PREF may only be 4 or 8; other values are not supported.
#define WEIGHT_PREF 8 | |
union { | |
float w[WEIGHT_PREF]; | |
uint8 ui8; | |
} weight_buf; | |
int_tp w_idx=0; | |
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]); | |
uint_tp orig_weight_addr = weight_addr; | |
weight_addr += SIMD_SIZE * WEIGHT_PREF; | |
int_tp kr = 0; // kr = Kernel Row | |
LOOP(KERNEL, kr,// LOOP is a macro that unrolls the loop. | |
{ | |
int_tp kc = 0; // kc = Kernel Column | |
LOOP(KERNEL, kc, | |
{ | |
for(int_tp br=0; br < OUT_BLOCK_HEIGHT; br++) { | |
for(int_tp bc=0; bc < OUT_BLOCK_WIDTH; bc++) { | |
float input = intel_sub_group_shuffle( in[br * K_STRIDE + kr], bc * K_STRIDE + kc); | |
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]); | |
} | |
} | |
// We assume KERNEL_W is equal to KERNEL_H here. | |
if ((w_idx + 1) % WEIGHT_PREF == 0 | |
#if KERNEL*KERNEL % 8 != 0 | |
&& ((w_idx + 1) <= (KERNEL * KERNEL - WEIGHT_PREF)) | |
#endif | |
) { | |
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]); | |
weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details. | |
} | |
#if KERNEL*KERNEL % 8 == 0 | |
// need to do nothing | |
#else | |
else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL * KERNEL - WEIGHT_PREF))) | |
#if KERNEL*KERNEL % 8 == 1 | |
weight_buf.w[0] = weights[weight_addr]; | |
#elif KERNEL*KERNEL % 4 == 0 | |
weight_buf.ui8.s0123 = intel_sub_group_block_read4((__global uint *)&weights[weight_addr]); | |
#else | |
// should never get here if kernel_w equals kernel_h; just in case.
#error unsupported kernel size. | |
#endif | |
#endif | |
++w_idx; | |
}); | |
}); | |
weight_addr = orig_weight_addr + KERNEL * KERNEL * SIMD_SIZE; | |
} | |
#ifdef IMAGE_AS_OUTPUT | |
// TODO: no ULT for that one yet! | |
uint_tp out_addr = ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD)) * (_OW + OWPAD) * (_OH + OHPAD);// out_addr indexes into start of 16 feature maps. | |
#else | |
// we need this address calculation for outputs because we support views and batching | |
uint_tp out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD) ) * (_OW + OWPAD) * (_OH + OHPAD); | |
#endif | |
out_addr += or * (_OW + OWPAD) + oc; // offset of the output block this work-item computes
// we need this address calculation for biases because we support views and batching | |
float bias = biases[(fm) % _OD ]; | |
#ifndef WRITE_PADDED_VALUES | |
if(get_global_id(0) != (get_global_size(0)-1) && | |
get_global_id(1) != (get_global_size(1)-1) ) | |
{ | |
#endif | |
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) { | |
// this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer. | |
#ifdef IMAGE_AS_OUTPUT | |
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c])); | |
#else | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
#endif | |
} | |
} | |
#ifndef WRITE_PADDED_VALUES | |
} else if ( get_global_id(1) != (get_global_size(1)-1) ) | |
{ | |
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) { | |
#ifdef IMAGE_AS_OUTPUT | |
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c])); | |
#else | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
#endif | |
} | |
} | |
} | |
else if ( get_global_id(0) != (get_global_size(0)-1) ) | |
{ | |
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) { | |
#ifdef IMAGE_AS_OUTPUT | |
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c])); | |
#else | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
#endif | |
} | |
} | |
} | |
else | |
{ | |
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) { | |
#ifdef IMAGE_AS_OUTPUT | |
write_imagef(outputs,(int2)(c,r*(_OW + OWPAD)),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c])); | |
#else | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
#endif | |
} | |
} | |
} | |
#endif //#ifndef WRITE_PADDED_VALUES | |
} | |
#endif | |
#if TILE_X % 4 == 0 | |
#define TILE_Y_STRIDE (64 / TILE_X) | |
#define INVEC_NUM ((TILE_Y + TILE_Y_STRIDE - 1) / TILE_Y_STRIDE) | |
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE))) | |
kernel void | |
convolve_simd16( // __global float *inputs, __global float* weights, __global float* outputs | |
__global float* inputs_base, | |
filter_qualifier float* weights_base, | |
__global float* biases_base, | |
__global float* outputs_base, | |
const ushort _IW, | |
const ushort _IH, | |
const ushort _OW, | |
const ushort _OH) | |
{ | |
__global float* outputs = outputs_base; | |
__global float* inputs = inputs_base; | |
filter_qualifier float* weights = weights_base; | |
__global float* biases = biases_base; | |
uint_tp oc = get_global_id(0) * MASTER_OUT_BLOCK_WIDTH; // oc = Output Column | |
uint_tp or = get_global_id(1) * MASTER_OUT_BLOCK_HEIGHT;// or = Output Row | |
uint_tp fm = get_global_id(2);// fm = Feature Map = od = Output Depth | |
uint_tp fmg = get_group_id(2); | |
uint_tp lid = get_local_id(2); | |
float out[OUT_BLOCK_SIZE]; | |
uint_tp in_addr; | |
// find weights address of the given neuron (lid is the index)
uint_tp weight_addr = (fmg % (_OD/SIMD_SIZE)) * INPUT_DEPTH * KERNEL * KERNEL * SIMD_SIZE + lid; | |
for(int_tp i=0;i<OUT_BLOCK_SIZE;i++) { | |
out[i]=0.0f; | |
} | |
uint_tp num_in_batch = ( fm ) / _OD; | |
uint_tp input_batch_offset = num_in_batch * (_IH + IHPAD) * (_IW + IWPAD) * TOTAL_INPUT_DEPTH_SIZE; | |
in_addr = input_batch_offset + INPUT_START_Z * (_IH + IHPAD) * (_IW + IWPAD) + (or*STRIDEY + INPUT_START_Y) * (_IW + IWPAD) + (oc*STRIDEX + INPUT_START_X) | |
+ ( lid / ( TILE_X / 4 ) ) * (_IW + IWPAD) * STRIDEY // y tile offset | |
+ ( lid % ( TILE_X / 4 ) ) * 4 * STRIDEX; // x tile offset | |
for(int_tp kd = 0; kd < _ID; kd++) | |
{ | |
union { | |
float4 in_vec[INVEC_NUM]; | |
float in_array[INVEC_NUM * 4]; | |
} in_buf; | |
uint_tp in_offset = in_addr; | |
int_tp reg = 0; | |
#if INVEC_NUM == 1 | |
LOOP(1, reg, | |
#elif INVEC_NUM == 2 | |
LOOP(2, reg, | |
#elif INVEC_NUM == 3 | |
LOOP(3, reg, | |
#elif INVEC_NUM == 4 | |
LOOP(4, reg, | |
#else | |
#error INVEC_NUM too large.
#endif | |
{ | |
in_buf.in_vec[reg] = *(global float4*)(inputs + in_offset); // read 16 elements | |
in_offset += (_IW + IWPAD) * TILE_Y_STRIDE; | |
}); | |
in_addr += (_IH + IHPAD) * (_IW + IWPAD); | |
// WEIGHT_PREF may only be 4 or 8; other values are not supported.
#define WEIGHT_PREF 8 | |
union { | |
float w[WEIGHT_PREF]; | |
uint8 ui8; | |
} weight_buf; | |
int_tp w_idx=0; | |
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]); | |
uint_tp orig_weight_addr = weight_addr; | |
weight_addr += SIMD_SIZE * WEIGHT_PREF; | |
#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4)) | |
int_tp kr = 0; // kr = Kernel Row | |
LOOP(KERNEL, kr,// LOOP is a macro that unrolls the loop. | |
{ | |
int_tp kc = 0; // kc = Kernel Column | |
LOOP(KERNEL, kc, | |
{ | |
for(int_tp br=0; br < OUT_BLOCK_HEIGHT; br++) { | |
for(int_tp bc=0; bc < OUT_BLOCK_WIDTH; bc++) { | |
float input = BLOCK_IN((br * STRIDEY + kr) * TILE_X + bc * STRIDEX + kc);//intel_sub_group_shuffle( in[br * K_STRIDE + kr], bc * K_STRIDE + kc); | |
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]); | |
} | |
} | |
// We assume KERNEL_W is equal to KERNEL_H here. | |
if ((w_idx + 1) % WEIGHT_PREF == 0 | |
#if KERNEL*KERNEL % 8 != 0 | |
&& ((w_idx + 1) <= (KERNEL * KERNEL - WEIGHT_PREF)) | |
#endif | |
) { | |
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]); | |
weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details. | |
} | |
#if KERNEL*KERNEL % 8 == 0 | |
// need to do nothing | |
#else | |
else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL * KERNEL - WEIGHT_PREF))) | |
#if KERNEL*KERNEL % 8 == 1 | |
weight_buf.w[0] = weights[weight_addr]; | |
#elif KERNEL*KERNEL % 4 == 0 | |
weight_buf.ui8.s0123 = intel_sub_group_block_read4((__global uint *)&weights[weight_addr]); | |
#else | |
// should never get here if kernel_w equals kernel_h; just in case.
#error unsupported kernel size. | |
#endif | |
#endif | |
++w_idx; | |
}); | |
}); | |
weight_addr = orig_weight_addr + KERNEL * KERNEL * SIMD_SIZE; | |
} | |
// we need this address calculation for outputs because we support views and batching | |
uint_tp out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD) ) * (_OW + OWPAD) * (_OH + OHPAD); | |
out_addr += or * (_OW + OWPAD) + oc; // offset of the output block this work-item computes
// we need this address calculation for biases because we support views and batching | |
float bias = biases[(fm) % _OD ]; | |
#ifndef WRITE_PADDED_VALUES | |
if(get_global_id(0) != (get_global_size(0)-1) && | |
get_global_id(1) != (get_global_size(1)-1) ) | |
{ | |
#endif | |
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) { | |
// this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer. | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
} | |
} | |
#ifndef WRITE_PADDED_VALUES | |
} else if ( get_global_id(1) != (get_global_size(1)-1) ) | |
{ | |
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) { | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
} | |
} | |
} | |
else if ( get_global_id(0) != (get_global_size(0)-1) ) | |
{ | |
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) { | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
} | |
} | |
} | |
else | |
{ | |
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) { | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
} | |
} | |
} | |
#endif //#ifndef WRITE_PADDED_VALUES | |
} | |
#endif // Stride > 2 | |
#endif | |
#endif | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(copyImage, Dtype) | |
(__global Dtype* image_data, | |
int_tp image_offset, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp adjustedHeight, const int_tp adjustedWidth, | |
const int_tp pad_h, const int_tp pad_w, | |
__global Dtype* output_image, | |
const int_tp output_offset, | |
const int_tp batch_size) { | |
uint_tp sX = get_global_id(0); | |
uint_tp sY = get_global_id(1); | |
uint_tp sZ = get_global_id(2); | |
int_tp in_y = sY - pad_h; | |
int_tp in_x = sX - pad_w; | |
int_tp batch_offset = 0; | |
int_tp adjusted_batch_offset = 0; | |
for(uint_tp batch_idx = 0; batch_idx < batch_size; batch_idx++) { | |
int_tp dst_offset = adjusted_batch_offset + output_offset + sZ*adjustedHeight*adjustedWidth + sY*adjustedWidth +sX; | |
int_tp src_offset = batch_offset + image_offset + sZ*height*width + in_y*width + in_x; | |
if((in_y >= 0 && in_y < height && in_x >= 0 && in_x < width)) | |
output_image[dst_offset] = image_data[src_offset]; | |
else | |
output_image[dst_offset] = 0; | |
batch_offset += height * width * channels; | |
adjusted_batch_offset += adjustedHeight * adjustedWidth * channels; | |
} | |
} | |
__kernel void TEMPLATE(copyWeightsSwizzled, Dtype) | |
(__global Dtype* weightIn, | |
__global Dtype* weightOut, | |
const int_tp kernel_w, | |
const int_tp kernel_h, | |
const int_tp channels, | |
const int_tp outputs, | |
const int_tp swizzleFactor) { | |
uint_tp sX = get_global_id(0); | |
//Original location | |
//Output location | |
int_tp outputSublayer = channels / swizzleFactor; | |
int_tp outputSublayerIndex = channels % swizzleFactor; | |
int_tp filter = sX / (kernel_w*kernel_h*channels); | |
int_tp kernel_X = sX % kernel_w; | |
int_tp kernel_Y = (sX / kernel_w) % kernel_h; | |
int_tp kernel_C = (sX / (kernel_w * kernel_h)) % channels; | |
int_tp FP = filter / swizzleFactor; | |
int_tp F1 = filter % swizzleFactor; | |
weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1] | |
= weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X]; | |
} | |
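// (copyWeightsSwizzled regroups filters in blocks of swizzleFactor so that
// the filters of a block are interleaved at the innermost dimension —
// presumably the layout expected by the SIMD block reads in the
// convolve_simd16 kernels above.)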
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(crop_copy, Dtype)(const int_tp n, const int_tp height, | |
const int_tp width, | |
const int_tp src_outer_stride, | |
const int_tp src_inner_stride, | |
const int_tp dest_outer_stride, | |
const int_tp dest_inner_stride, | |
__global const Dtype* src, | |
const int_tp src_off, | |
__global Dtype* dest, | |
const int_tp dest_off) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
int_tp src_start = index / height * src_outer_stride | |
+ index % height * src_inner_stride; | |
int_tp dest_start = index / height * dest_outer_stride | |
+ index % height * dest_inner_stride; | |
for (int_tp i = 0; i < width; ++i) { | |
dest[dest_off + dest_start + i] = src[src_off + src_start + i]; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(dropout_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global const uint_tp* mask, | |
const uint_tp threshold, | |
const Dtype scale, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out[index] = in[index] * ((mask[index] > threshold)?1.0:0.0) * scale; | |
} | |
} | |
__kernel void TEMPLATE(dropout_backward,Dtype)( | |
const int_tp n, __global const Dtype* in_diff, | |
__global const uint_tp* mask, const uint_tp threshold, | |
const Dtype scale, | |
__global Dtype* out_diff) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out_diff[index] = in_diff[index] * ((mask[index] > threshold)?1.0:0.0) * scale; | |
} | |
} | |
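// (The mask-and-scale form above is inverted dropout: surviving activations
// are multiplied by scale — 1/(1 - dropout_ratio) in Caffe — so no extra
// rescaling is needed at test time.)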
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(eltwise_max_forward,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data_a, | |
__global const Dtype* bottom_data_b, const int_tp blob_idx, | |
__global Dtype* top_data, | |
__global int_tp* mask) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
Dtype maxval = -FLT_MAX; | |
int_tp maxidx = -1; | |
if (bottom_data_a[index] > bottom_data_b[index]) { | |
// only update for very first bottom_data blob (blob_idx == 0) | |
if (blob_idx == 0) { | |
maxval = bottom_data_a[index]; | |
top_data[index] = maxval; | |
maxidx = blob_idx; | |
mask[index] = maxidx; | |
} | |
} else { | |
maxval = bottom_data_b[index]; | |
top_data[index] = maxval; | |
maxidx = blob_idx + 1; | |
mask[index] = maxidx; | |
} | |
} | |
} | |
__kernel void TEMPLATE(eltwise_max_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top_diff, | |
const int_tp blob_idx, | |
__global const int_tp* mask, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
Dtype gradient = 0; | |
if (mask[index] == blob_idx) { | |
gradient += top_diff[index]; | |
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(elu_forward,Dtype)(const int n, __global const Dtype* in, | |
__global Dtype* out, | |
Dtype alpha) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out[index] = in[index] > 0 ? in[index] : alpha * (exp(in[index]) - 1.0); | |
} | |
} | |
__kernel void TEMPLATE(elu_backward,Dtype)(const int n, __global const Dtype* in_diff, | |
__global const Dtype* out_data, | |
__global const Dtype* in_data, | |
__global Dtype* out_diff, | |
Dtype alpha) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out_diff[index] = | |
in_data[index] > 0 ? | |
in_diff[index] : in_diff[index] * (out_data[index] + alpha); | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(embed_forward,Dtype)(const int_tp nthreads, | |
__global const Dtype* bottom_data, | |
__global const Dtype* weight, | |
const int_tp M, const int_tp N, | |
const int_tp K, | |
__global Dtype* top_data) { | |
for (int_tp top_index = get_global_id(0); top_index < nthreads; | |
top_index += get_global_size(0)) { | |
const int_tp n = top_index / N; | |
const int_tp d = top_index % N; | |
const int_tp index = (int_tp)(bottom_data[n]); | |
const int_tp weight_index = index * N + d; | |
top_data[top_index] = weight[weight_index]; | |
} | |
} | |
// atomic_add from: http://suhorukov.blogspot.com/2011/12/opencl-11-atomic-operations-on-floating.html | |
#if (TYPE == TYPE_FLOAT) | |
inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source, const Dtype operand) { | |
union { | |
uint_tp intVal; | |
Dtype floatVal; | |
} newVal; | |
union { | |
uint_tp intVal; | |
Dtype floatVal; | |
} prevVal; | |
do { | |
prevVal.floatVal = *source; | |
newVal.floatVal = prevVal.floatVal + operand; | |
} while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal); | |
} | |
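// (atomic_add above emulates a floating-point atomic with a compare-and-swap
// retry loop: the float is type-punned to uint through the union, and
// atomic_cmpxchg only commits the new value if no other work-item updated
// *source since prevVal was read; otherwise the loop re-reads and retries.
// This is the sole atomic_cmpxchg call in the source, presumably the one the
// build log at the top of this file rejects.)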
__kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, | |
__global const Dtype* top_diff, const int_tp M, const int_tp N, const int_tp K, | |
__global Dtype* weight_diff) { | |
for (int_tp top_index = get_global_id(0); top_index < nthreads; | |
top_index += get_global_size(0)) { | |
const int_tp n = top_index / N; | |
const int_tp d = top_index % N; | |
const int_tp index = (int_tp)(bottom_data[n]); | |
const int_tp weight_index = index * N + d; | |
TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index), *(top_diff + top_index)); | |
} | |
} | |
#endif | |
#if (TYPE == TYPE_DOUBLE) | |
#ifdef ATOMICS_64_AVAILABLE | |
inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source, const Dtype operand) { | |
union { | |
unsigned long intVal; | |
Dtype floatVal; | |
} newVal; | |
union { | |
unsigned long intVal; | |
Dtype floatVal; | |
} prevVal; | |
do { | |
prevVal.floatVal = *source; | |
newVal.floatVal = prevVal.floatVal + operand; | |
} while (atom_cmpxchg((volatile __global unsigned long *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal); | |
} | |
__kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, | |
__global const Dtype* top_diff, const int_tp M, const int_tp N, const int_tp K, | |
__global Dtype* weight_diff) { | |
for (int_tp top_index = get_global_id(0); top_index < nthreads; | |
top_index += get_global_size(0)) { | |
const int_tp n = top_index / N; | |
const int_tp d = top_index % N; | |
const int_tp index = (int_tp)(bottom_data[n]); | |
const int_tp weight_index = index * N + d; | |
TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index), *(top_diff + top_index)); | |
} | |
} | |
#endif | |
#endif | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(fft_phony,Dtype)(Dtype arg) { | |
Dtype out = arg; | |
} | |
#ifdef FFT | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
#define DtypeComplex Dtype2 | |
__kernel void TEMPLATE(copy2buffer_cyclic_shift_in,Dtype)( | |
__global Dtype* fft_gpu_weights_real, const int_tp offset_fft_gpu_weights_real, | |
__global Dtype* weight, const int_tp offset_weight, | |
const int_tp ker_size, const int_tp ch_gr, const int_tp ker_size_ch_gr, | |
const int_tp ker_w, const int_tp ker_c_h, const int_tp ker_c_w, | |
const int_tp fft_height, const int_tp fft_width, const int_tp complex_w_len) { | |
fft_gpu_weights_real += offset_fft_gpu_weights_real; | |
weight += offset_weight; | |
int_tp gId = get_global_id(0); | |
int_tp out = gId / ker_size_ch_gr; | |
int_tp c = (gId - out * ker_size_ch_gr) / ker_size; | |
int_tp map_offset = out * ch_gr + c; | |
int_tp map_offset_ker_size = map_offset * ker_size; | |
int_tp pos_in_map = gId - map_offset_ker_size; | |
int_tp h = pos_in_map / ker_w; | |
int_tp h_ker_w = h * ker_w; | |
int_tp w = pos_in_map - h_ker_w; | |
int_tp src_idx = map_offset_ker_size + h_ker_w + w; | |
int_tp ky = h - ker_c_h; | |
if (ky < 0) ky += fft_height; | |
int_tp kx = w - ker_c_w; | |
if (kx < 0) kx += fft_width; | |
int_tp dst_idx = (map_offset * fft_height + ky) * complex_w_len + kx; | |
fft_gpu_weights_real[dst_idx] = weight[src_idx]; | |
} | |
/* Use when width < 4 */ | |
__kernel void TEMPLATE(copy2buffer_left_top_in_naive,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp size, | |
const int_tp height_out, const int_tp width_out, | |
const int_tp height, const int_tp width, const int_tp stride_h, const int_tp stride_w, | |
const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp h = gId / width; | |
int_tp w = gId - (h * width); | |
int_tp dst_idx = (h*stride_h + pad_h)*width_out + (w*stride_w + pad_w); | |
map_out[dst_idx] = map_in[gId]; | |
} | |
/* Use when width < 4 */ | |
__kernel void TEMPLATE(copy2buffer_left_top_in_naive_2d,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp map_out_size, const int_tp size, const int_tp count, | |
const int_tp height_out, const int_tp width_out, | |
const int_tp height, const int_tp width, const int_tp stride_h, const int_tp stride_w, | |
const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId_x = get_global_id(0); | |
int_tp gId_y = get_global_id(1); | |
int_tp h = gId_x / width; | |
int_tp w = gId_x - (h * width); | |
int_tp src_idx = gId_y * size + gId_x; | |
int_tp dst_idx = gId_y * map_out_size + | |
(h * stride_h + pad_h) * width_out + (w * stride_w + pad_w); | |
map_out[dst_idx] = map_in[src_idx]; | |
} | |
/* Use when width >= 4 */ | |
__kernel void TEMPLATE(copy2buffer_left_top_in,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp size, | |
const int_tp height_out, const int_tp width_out, | |
const int_tp height, const int_tp width, const int_tp stride_h, const int_tp stride_w, | |
const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp count = size >> 2; | |
int_tp gId4 = gId << 2; | |
int_tp h = gId4 / width; | |
int_tp w = gId4 - (h * width); | |
int_tp dst_h = h*stride_h + pad_h; | |
int_tp dst_w = w*stride_w + pad_w; | |
int_tp dst_idx = dst_h*width_out + dst_w; | |
if (gId < count) { | |
Dtype4 map_in_cache4 = vload4(gId, map_in); | |
int_tp has_pad = width - dst_w; | |
if (has_pad >= 4) { | |
vstore4(map_in_cache4, dst_idx >> 2, map_out); | |
} else { | |
if (0 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w; | |
} | |
map_out[dst_idx] = map_in_cache4.x; | |
if (1 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 1; | |
} | |
map_out[dst_idx+1] = map_in_cache4.y; | |
if (2 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 2; | |
} | |
map_out[dst_idx+2] = map_in_cache4.z; | |
if (3 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 3; | |
} | |
map_out[dst_idx+3] = map_in_cache4.w; | |
dst_h += 1; | |
dst_w = pad_w; | |
} | |
} else if (gId == count) { | |
int_tp res = size - (count << 2); /* size % 4 */ | |
if (res > 0) { | |
Dtype4 map_in_cache4 = 0.f; | |
if (res >= 1) | |
map_in_cache4.x = map_in[gId4]; | |
if (res >= 2) | |
map_in_cache4.y = map_in[gId4+1]; | |
if (res == 3) | |
map_in_cache4.z = map_in[gId4+2]; | |
int_tp has_pad = width - dst_w; | |
if (has_pad >= 4) { | |
vstore4(map_in_cache4, dst_idx >> 2, map_out); | |
} else { | |
if (0 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w; | |
} | |
map_out[dst_idx] = map_in_cache4.x; | |
if (1 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 1; | |
} | |
map_out[dst_idx+1] = map_in_cache4.y; | |
if (2 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 2; | |
} | |
map_out[dst_idx+2] = map_in_cache4.z; | |
if (3 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 3; | |
} | |
map_out[dst_idx+3] = map_in_cache4.w; | |
dst_h += 1; | |
dst_w = pad_w; | |
} | |
} | |
} | |
} | |
/* Use when width >= 4 */ | |
__kernel void TEMPLATE(copy2buffer_left_top_in_2d,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp map_out_size, const int_tp size, const int_tp count, | |
const int_tp height_out, const int_tp width_out, | |
const int_tp height, const int_tp width, const int_tp stride_h, const int_tp stride_w, | |
const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp gId_y = get_global_id(1); | |
int_tp gId4 = gId << 2; | |
int_tp h = gId4 / width; | |
int_tp w = gId4 - (h * width); | |
int_tp dst_h = h*stride_h + pad_h; | |
int_tp dst_w = w*stride_w + pad_w; | |
int_tp dst_idx = dst_h*width_out + dst_w; | |
const __global Dtype* map_in_2d = map_in + gId_y * size; | |
__global Dtype* map_out_2d = map_out + gId_y * map_out_size; | |
if (gId < count) { | |
Dtype4 map_in_cache4 = vload4(gId, map_in_2d); | |
int_tp has_pad = width - dst_w; | |
if (has_pad >= 4) { | |
vstore4(map_in_cache4, dst_idx >> 2, map_out_2d); | |
} else { | |
if (0 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w; | |
} | |
map_out_2d[dst_idx] = map_in_cache4.x; | |
if (1 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 1; | |
} | |
map_out_2d[dst_idx+1] = map_in_cache4.y; | |
if (2 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 2; | |
} | |
map_out_2d[dst_idx+2] = map_in_cache4.z; | |
if (3 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 3; | |
} | |
map_out_2d[dst_idx+3] = map_in_cache4.w; | |
dst_h += 1; | |
dst_w = pad_w; | |
} | |
} else if (gId == count) { | |
int_tp res = size - (count << 2); /* size % 4 */ | |
if (res > 0) { | |
Dtype4 map_in_cache4 = 0.f; | |
if (res >= 1) | |
map_in_cache4.x = map_in_2d[gId4]; | |
if (res >= 2) | |
map_in_cache4.y = map_in_2d[gId4+1]; | |
if (res == 3) | |
map_in_cache4.z = map_in_2d[gId4+2]; | |
int_tp has_pad = width - dst_w; | |
if (has_pad >= 4) { | |
vstore4(map_in_cache4, dst_idx >> 2, map_out_2d); | |
} else { | |
if (0 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w; | |
} | |
map_out_2d[dst_idx] = map_in_cache4.x; | |
if (1 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 1; | |
} | |
map_out_2d[dst_idx+1] = map_in_cache4.y; | |
if (2 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 2; | |
} | |
map_out_2d[dst_idx+2] = map_in_cache4.z; | |
if (3 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 3; | |
} | |
map_out_2d[dst_idx+3] = map_in_cache4.w; | |
dst_h += 1; | |
dst_w = pad_w; | |
} | |
} | |
} | |
} | |
/* Use when width_out < 4 */ | |
__kernel void TEMPLATE(copy2buffer_left_top_out_naive,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp size, | |
const int_tp height_out, const int_tp width_out, | |
const int_tp fft_height, const int_tp fft_width, | |
const int_tp ker_center_h, const int_tp ker_center_w, | |
const int_tp stride_h, const int_tp stride_w, | |
const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp h_out = gId / width_out; | |
int_tp w_out = gId - (h_out * width_out); | |
int_tp h = h_out * stride_h + ker_center_h; | |
int_tp w = w_out * stride_w + ker_center_w; | |
int_tp src_idx = h*fft_width + w; | |
map_out[gId] = map_in[src_idx]; | |
} | |
/* Use when width_out < 4 */ | |
__kernel void TEMPLATE(copy2buffer_left_top_out_naive_2d,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp size, const int_tp count, const int_tp map_in_size, | |
const int_tp height_out, const int_tp width_out, | |
const int_tp fft_height, const int_tp fft_width, | |
const int_tp ker_center_h, const int_tp ker_center_w, | |
const int_tp stride_h, const int_tp stride_w, | |
const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp out = get_global_id(1); | |
int_tp h_out = gId / width_out; | |
int_tp w_out = gId - (h_out * width_out); | |
int_tp h = h_out * stride_h + ker_center_h; | |
int_tp w = w_out * stride_w + ker_center_w; | |
int_tp src_idx = out * map_in_size + h*fft_width + w; | |
int_tp dst_idx = out * size + gId; | |
map_out[dst_idx] = map_in[src_idx]; | |
} | |
/* Use when width_out >= 4 */ | |
__kernel void TEMPLATE(copy2buffer_left_top_out,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp size, | |
const int_tp height_out, const int_tp width_out, | |
const int_tp fft_height, const int_tp fft_width, | |
const int_tp ker_c_h, const int_tp ker_c_w, | |
const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp count = size >> 2; | |
int_tp gId4 = gId << 2; | |
int_tp h_out = gId4 / width_out; | |
int_tp w_out = gId4 - (h_out * width_out); | |
int_tp h = h_out * stride_h + ker_c_h; | |
int_tp w = w_out * stride_w + ker_c_w; | |
int_tp src_idx = h*fft_width + w; | |
if (gId < count) { | |
Dtype4 map_in_cache4; | |
int_tp has_pad = width_out - (w - pad_w); | |
if (has_pad >= 4) { | |
map_in_cache4 = vload4(src_idx >> 2, map_in); | |
} else { | |
int_tp right_elements = fft_width - width_out; | |
if (0 == has_pad) { | |
src_idx += right_elements; | |
} | |
map_in_cache4.x = map_in[src_idx]; | |
if (1 == has_pad) { | |
src_idx += right_elements; | |
} | |
map_in_cache4.y = map_in[src_idx+1]; | |
if (2 == has_pad) { | |
src_idx += right_elements; | |
} | |
map_in_cache4.z = map_in[src_idx+2]; | |
if (3 == has_pad) { | |
src_idx += right_elements; | |
} | |
map_in_cache4.w = map_in[src_idx+3]; | |
} | |
vstore4(map_in_cache4, gId, map_out); | |
} else if (gId == count) { | |
int_tp res = size - (count << 2); /* size % 4 */ | |
if (res > 0) { | |
for (int_tp i = gId4; i < size; ++i) { | |
map_out[i] = map_in[src_idx]; | |
src_idx++; | |
} | |
} | |
} | |
} | |
/* Use when width_out >= 4 */ | |
__kernel void TEMPLATE(copy2buffer_left_top_out_2d,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp size, const int_tp count, const int_tp map_in_size, | |
const int_tp height_out, const int_tp width_out, | |
const int_tp fft_height, const int_tp fft_width, | |
const int_tp ker_c_h, const int_tp ker_c_w, | |
const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp out = get_global_id(1); | |
int_tp gId4 = gId << 2; | |
int_tp h_out = gId4 / width_out; | |
int_tp w_out = gId4 - (h_out * width_out); | |
int_tp h = h_out * stride_h + ker_c_h; | |
int_tp w = w_out * stride_w + ker_c_w; | |
int_tp src_idx = h*fft_width + w; | |
const __global Dtype* map_in_2d = map_in + out * map_in_size; | |
__global Dtype* map_out_2d = map_out + out * size; | |
if (gId < count) { | |
Dtype4 map_in_cache4; | |
int_tp has_pad = width_out - (w - pad_w); | |
if (has_pad >= 4) { | |
map_in_cache4 = vload4(src_idx >> 2, map_in_2d); | |
} else { | |
int_tp right_elements = fft_width - width_out; | |
if (0 == has_pad) { | |
src_idx += right_elements; | |
} | |
map_in_cache4.x = map_in_2d[src_idx]; | |
if (1 == has_pad) { | |
src_idx += right_elements; | |
} | |
map_in_cache4.y = map_in_2d[src_idx+1]; | |
if (2 == has_pad) { | |
src_idx += right_elements; | |
} | |
map_in_cache4.z = map_in_2d[src_idx+2]; | |
if (3 == has_pad) { | |
src_idx += right_elements; | |
} | |
map_in_cache4.w = map_in_2d[src_idx+3]; | |
} | |
vstore4(map_in_cache4, gId, map_out_2d); | |
} else if (gId == count) { | |
int_tp res = size - (count << 2); /* size % 4 */ | |
if (res > 0) { | |
const __global Dtype4* map_in_2d_4 = | |
(const __global Dtype4*)(map_in_2d + src_idx); | |
__global Dtype4* map_out_2d_4 = (__global Dtype4*)(map_out_2d + gId4); | |
if (res == 3) { | |
map_out_2d_4[0].xyz = map_in_2d_4[0].xyz; | |
} else if (res == 2) { | |
map_out_2d_4[0].xy = map_in_2d_4[0].xy; | |
} else if (res == 1) { | |
map_out_2d_4[0].x = map_in_2d_4[0].x; | |
} | |
} | |
} | |
} | |
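/* Cyclic-shift copy-out: reads the (fft_height x fft_width) map with a */ | |
/* circular shift by the kernel center (ky/kx wrap around when negative) */ | |
/* and writes a dense strided output, presumably undoing the circular */ | |
/* alignment introduced by the FFT-based convolution. */ | |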
__kernel void TEMPLATE(copy2buffer_cyclic_shift_out,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp width_out, | |
const int_tp fft_height, const int_tp fft_width, | |
const int_tp ker_center_h, const int_tp ker_center_w, | |
const int_tp stride_h, const int_tp stride_w, | |
const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp h_out = gId / width_out; | |
int_tp w_out = gId - (h_out * width_out); | |
int_tp h = h_out * stride_h + pad_h; | |
int_tp w = w_out * stride_w + pad_w; | |
int_tp ky = h - ker_center_h; | |
if (ky < 0) ky += fft_height; | |
int_tp kx = w - ker_center_w; | |
if (kx < 0) kx += fft_width; | |
int_tp src_idx = ky*fft_width + kx; | |
map_out[gId] = map_in[src_idx]; | |
} | |
__kernel void TEMPLATE(copy2buffer_cyclic_shift_out_2d,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp map_out_size, const int_tp map_in_size, | |
const int_tp width_out, | |
const int_tp fft_height, const int_tp fft_width, | |
const int_tp ker_center_h, const int_tp ker_center_w, | |
const int_tp stride_h, const int_tp stride_w, | |
const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp gId_y = get_global_id(1); | |
int_tp h_out = gId / width_out; | |
int_tp w_out = gId - (h_out * width_out); | |
int_tp h = h_out * stride_h + pad_h; | |
int_tp w = w_out * stride_w + pad_w; | |
int_tp ky = h - ker_center_h; | |
if (ky < 0) ky += fft_height; | |
int_tp kx = w - ker_center_w; | |
if (kx < 0) kx += fft_width; | |
int_tp src_idx = gId_y * map_in_size + ky*fft_width + kx; | |
int_tp dst_idx = gId_y * map_out_size + gId; | |
map_out[dst_idx] = map_in[src_idx]; | |
} | |
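/* Complex multiply-accumulate with conjugation. Values are interleaved */ | |
/* (re, im) pairs, two complex numbers per Dtype4. Per pair: */ | |
/*   re += x1*x2 + y1*y2,  im += y1*x2 - x1*y2, */ | |
/* i.e. dst += s1 * conj(s2), accumulated over the ch_gr channels. */ | |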
__kernel void TEMPLATE(complex_conjugate_multiplication_1d,Dtype)(__global Dtype* dst, | |
const int_tp offset_dst, | |
const __global Dtype* src1, const int_tp offset_src1, | |
const __global Dtype* src2, const int_tp offset_src2, | |
const int_tp ch_gr) { | |
dst += offset_dst; | |
src1 += offset_src1; | |
src2 += offset_src2; | |
int_tp gId = get_global_id(0); | |
int_tp size = get_global_size(0); | |
Dtype4 dst_cache = 0.f; | |
int_tp src_idx; | |
Dtype4 s1_cache; | |
Dtype4 s2_cache; | |
for (int_tp c = 0; c < ch_gr; ++c) { | |
src_idx = size * c + gId; | |
s1_cache = vload4(src_idx, src1); | |
s2_cache = vload4(src_idx, src2); | |
dst_cache.x += s1_cache.x * s2_cache.x + s1_cache.y * s2_cache.y; | |
dst_cache.y += -s1_cache.x * s2_cache.y + s1_cache.y * s2_cache.x; | |
dst_cache.z += s1_cache.z * s2_cache.z + s1_cache.w * s2_cache.w; | |
dst_cache.w += -s1_cache.z * s2_cache.w + s1_cache.w * s2_cache.z; | |
} | |
((__global Dtype4*)(&dst[gId<<2]))[0] += dst_cache; | |
} | |
__kernel void TEMPLATE(complex_conjugate_multiplication_2d,Dtype)(__global Dtype* dst, | |
const int_tp offset_dst, | |
const __global Dtype* src1, const int_tp offset_src1, | |
const __global Dtype* src2, const int_tp offset_src2, | |
const int_tp out_gr, const int_tp map_size, const int_tp ch_gr) { | |
dst += offset_dst; | |
src1 += offset_src1; | |
src2 += offset_src2; | |
int_tp gId = get_global_id(0); | |
int_tp out = get_global_id(1); | |
int_tp src1_idx, src2_idx; | |
int_tp dst_map_offset = map_size * out; | |
int_tp dst_idx = dst_map_offset + gId; | |
Dtype4 s1_cache, s2_cache; | |
Dtype4 dst_cache = 0.f; | |
int_tp map_offset = dst_map_offset * ch_gr; | |
for (int_tp i = 0; i < ch_gr; ++i) { | |
src1_idx = map_size * i + gId; | |
src2_idx = map_offset + src1_idx; | |
s1_cache = vload4(src1_idx, src1); | |
s2_cache = vload4(src2_idx, src2); | |
dst_cache.xz += mad( s1_cache.xz, s2_cache.xz, s1_cache.yw * s2_cache.yw); | |
dst_cache.yw += mad(-s1_cache.xz, s2_cache.yw, s1_cache.yw * s2_cache.xz); | |
} | |
vstore4(dst_cache, dst_idx, dst); | |
} | |
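/* Same product as the 2d kernel above, but src1 (shared by all output */ | |
/* maps of the group) is staged into local memory by the local_out == 0 */ | |
/* row and reused after the barrier. The early returns assume the */ | |
/* NDRange is padded by whole work-groups, since barrier() must be */ | |
/* reached uniformly within a work-group. */ | |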
__kernel void TEMPLATE(complex_conjugate_multiplication_2d_SLM,Dtype)( | |
__global Dtype* restrict dst, const int_tp offset_dst, | |
const __global Dtype* restrict src1, const int_tp offset_src1, | |
__local Dtype* local_src1, | |
const __global Dtype* restrict src2, const int_tp offset_src2, | |
const int_tp out_gr, const int_tp map_size, const int_tp ch_gr) { | |
int_tp gId = get_global_id(0); | |
if (gId >= map_size) return; /* Do not remove this */ | |
int_tp out = get_global_id(1); | |
if (out >= out_gr) return; /* Do not remove this */ | |
dst += offset_dst; | |
src1 += offset_src1; | |
src2 += offset_src2; | |
int_tp tId = get_local_id(0); | |
int_tp local_out = get_local_id(1); | |
int_tp tile_size = get_local_size(0); | |
Dtype4 s1_cache; | |
if (local_out == 0) { | |
for (int_tp c = 0; c < ch_gr; ++c) { | |
s1_cache = vload4(map_size * c + gId, src1); | |
vstore4(s1_cache, tile_size * c + tId, local_src1); | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int_tp dst_map_offset = map_size * out; | |
int_tp dst_idx = (dst_map_offset + gId) << 2; | |
Dtype4 dst_cache = 0.f; | |
Dtype4 s2_cache; | |
int_tp ch_offset = 0; | |
int_tp map_offset = dst_map_offset * ch_gr; | |
for (int_tp c = 0; c < ch_gr; ++c) { | |
ch_offset = map_size * c; | |
s1_cache = vload4(tile_size * c + tId, local_src1); | |
s2_cache = vload4(map_offset + ch_offset + gId, src2); | |
dst_cache.xz += mad( s1_cache.xz, s2_cache.xz, s1_cache.yw * s2_cache.yw); | |
dst_cache.yw += mad(-s1_cache.xz, s2_cache.yw, s1_cache.yw * s2_cache.xz); | |
} | |
((__global Dtype4*)(&dst[dst_idx]))[0] += dst_cache; | |
} | |
__kernel void TEMPLATE(complex_conjugate_multiplication_3d,Dtype)(__global Dtype* dst, | |
const int_tp offset_dst, | |
const __global Dtype* src1, const int_tp offset_src1, | |
const __global Dtype* src2, const int_tp offset_src2, | |
const int_tp out_gr, const int_tp size, const int_tp ch_gr) { | |
dst += offset_dst; | |
src1 += offset_src1; | |
src2 += offset_src2; | |
int_tp gId = get_global_id(0); | |
int_tp out = get_global_id(1); | |
int_tp ch = get_global_id(2); | |
Dtype4 dst_cache = 0.f; | |
Dtype4 s1_cache = ((__global Dtype4*)(&(src1[(size*ch+gId)<<2])))[0]; | |
Dtype4 s2_cache = ((__global Dtype4*)(&(src2[(size*(out*ch_gr+ch)+gId)<<2])))[0]; | |
dst_cache.x = s1_cache.x * s2_cache.x + s1_cache.y * s2_cache.y; | |
dst_cache.y = -s1_cache.x * s2_cache.y + s1_cache.y * s2_cache.x; | |
dst_cache.z = s1_cache.z * s2_cache.z + s1_cache.w * s2_cache.w; | |
dst_cache.w = -s1_cache.z * s2_cache.w + s1_cache.w * s2_cache.z; | |
((__global Dtype4*)(&dst[(size*out+gId)<<2]))[0] += dst_cache; | |
} | |
__kernel void TEMPLATE(complex_conjugate_multiplication_3d_SLM,Dtype)(__global Dtype* dst, | |
const int_tp offset_dst, __local Dtype* local_dst, | |
const __global Dtype* src1, const int_tp offset_src1, | |
__local Dtype* local_src1, const __global Dtype* src2, | |
const int_tp offset_src2, const int_tp out_gr, const int_tp map_size, | |
const int_tp ch_gr) { | |
int_tp gId = get_global_id(0); | |
if (gId >= map_size) return; /* Do not remove this */ | |
int_tp out = get_global_id(1); | |
if (out >= out_gr) return; /* Do not remove this */ | |
int_tp ch = get_global_id(2); | |
if (ch >= ch_gr) return; /* Do not remove this */ | |
dst += offset_dst; | |
src1 += offset_src1; | |
src2 += offset_src2; | |
int_tp tId = get_local_id(0); | |
int_tp local_out = get_local_id(1); | |
int_tp tile_size = get_local_size(0); | |
Dtype4 s1_cache; | |
if (local_out == 0) { | |
s1_cache = vload4(map_size * ch + gId, src1); | |
vstore4(s1_cache, tile_size * ch + tId, local_src1); | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int_tp dst_map_offset = map_size * out; | |
int_tp dst_idx = (dst_map_offset + gId) << 2; | |
Dtype4 dst_cache = 0.f; | |
Dtype4 s2_cache; | |
s1_cache = vload4(tile_size * ch + tId, local_src1); | |
s2_cache = vload4((dst_map_offset * ch_gr) + (map_size * ch) + gId, src2); | |
dst_cache.x += s1_cache.x * s2_cache.x + s1_cache.y * s2_cache.y; | |
dst_cache.y += -s1_cache.x * s2_cache.y + s1_cache.y * s2_cache.x; | |
dst_cache.z += s1_cache.z * s2_cache.z + s1_cache.w * s2_cache.w; | |
dst_cache.w += -s1_cache.z * s2_cache.w + s1_cache.w * s2_cache.z; | |
((__global Dtype4*)(&dst[dst_idx]))[0] += dst_cache; | |
} | |
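/* Plain complex multiply (no conjugation): one src1 value is broadcast */ | |
/* against ch_gr src2 channels with re = x1*x2 - y1*y2 and */ | |
/* im = x1*y2 + y1*x2, accumulated into dst per channel. */ | |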
__kernel void TEMPLATE(complex_multiplication_1d,Dtype)(__global Dtype* dst, | |
const int_tp offset_dst, | |
const __global Dtype* src1, const int_tp offset_src1, | |
const __global Dtype* src2, const int_tp offset_src2, | |
const int_tp size, const int_tp ch_gr) { | |
dst += offset_dst; | |
src1 += offset_src1; | |
src2 += offset_src2; | |
int_tp gId = get_global_id(0); | |
Dtype4 s2_cache; | |
Dtype4 dst_cache = 0.f; | |
int_tp idx_with_ch; | |
Dtype4 s1_cache = vload4(gId, src1); | |
for (int_tp ch = 0; ch < ch_gr; ++ch) { | |
idx_with_ch = size * ch + gId; | |
s2_cache = vload4(idx_with_ch, src2); | |
dst_cache.xz = s1_cache.xz * s2_cache.xz - s1_cache.yw * s2_cache.yw; | |
dst_cache.yw = s1_cache.xz * s2_cache.yw + s1_cache.yw * s2_cache.xz; | |
((__global Dtype4*)(&dst[idx_with_ch<<2]))[0] += dst_cache; | |
} | |
} | |
__kernel void TEMPLATE(complex_multiplication_2d_SLM,Dtype)(__global Dtype* restrict dst, | |
const int_tp offset_dst, __local Dtype* local_dst, | |
const __global Dtype* restrict src1, const int_tp offset_src1, | |
const __global Dtype* restrict src2, const int_tp offset_src2, | |
const int_tp num_output, const int_tp size, const int_tp ch_gr) { | |
int_tp gId = get_global_id(0); | |
if (gId >= size) return; | |
int_tp out = get_global_id(1); | |
if (out >= num_output) return; | |
dst += offset_dst; | |
src1 += offset_src1; | |
src2 += offset_src2; | |
int_tp tId = get_local_id(0); | |
int_tp tOut = get_local_id(1); | |
int_tp tile_size = get_local_size(0); | |
int_tp local_out_size = get_local_size(1); | |
int_tp out_offset = out * size; | |
int_tp out_ch_offset = out_offset * ch_gr; | |
int_tp tile_size_in_all_ch = tile_size * ch_gr; | |
int_tp local_out_ch_offset = tOut * tile_size_in_all_ch; | |
int_tp src2_idx, local_dst_idx; | |
Dtype4 s2_cache, dst_cache; | |
int_tp src1_idx = out_offset + gId; | |
Dtype4 s1_cache = vload4(src1_idx, src1); | |
for (int_tp ch = 0; ch < ch_gr; ++ch) { | |
src2_idx = out_ch_offset + ch * size + gId; | |
s2_cache = vload4(src2_idx, src2); | |
dst_cache.xz = s1_cache.xz * s2_cache.xz - s1_cache.yw * s2_cache.yw; | |
dst_cache.yw = s1_cache.xz * s2_cache.yw + s1_cache.yw * s2_cache.xz; | |
local_dst_idx = local_out_ch_offset + ch * tile_size + tId; | |
vstore4(dst_cache, local_dst_idx, local_dst); | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int_tp start_idx, half_start_idx; | |
int_tp ch_offset; | |
int_tp this_idx, that_idx; | |
for (int_tp offset = local_out_size >> 1; offset > 0; offset >>= 1) { | |
if (tOut < offset) { | |
start_idx = tOut * tile_size_in_all_ch + tId; | |
half_start_idx = (tOut + offset) * tile_size_in_all_ch + tId; | |
for (int_tp ch = 0; ch < ch_gr; ++ch) { | |
ch_offset = ch * tile_size; | |
this_idx = (start_idx + ch_offset) << 2; | |
that_idx = (half_start_idx + ch_offset) << 2; | |
((__local Dtype4*)(&local_dst[this_idx]))[0] += | |
((__local Dtype4*)(&local_dst[that_idx]))[0]; | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
if (tOut == 0) { | |
for (int_tp ch = 0; ch < ch_gr; ++ch) { | |
dst_cache = vload4(tile_size * ch + tId, local_dst); | |
((__global Dtype4*)(&dst[(size * ch + gId)<<2]))[0] += dst_cache; | |
} | |
} | |
} | |
__kernel void TEMPLATE(complex_multiplication_3d,Dtype)(__global Dtype* dst, | |
const int_tp offset_dst, | |
const __global Dtype* src1, const int_tp offset_src1, | |
const __global Dtype* src2, const int_tp offset_src2, | |
const int_tp size, const int_tp ch_gr, const int_tp out_gr, const int_tp num_output) { | |
dst += offset_dst; | |
src1 += offset_src1; | |
src2 += offset_src2; | |
int_tp gId = get_global_id(0); | |
int_tp ch = get_global_id(1); | |
int_tp out = get_global_id(2); | |
int_tp g = out / out_gr; | |
ch += (g * ch_gr); | |
int_tp c_offset = ch - ((ch / ch_gr) * ch_gr); | |
__global Dtype2* dst_ch = ((__global Dtype2*)(dst)) + (size * ch); | |
__global Dtype2* src1_out = ((__global Dtype2*)(src1)) + (size * out); | |
__global Dtype2* src2_out_ch = ((__global Dtype2*)(src2)) + (size * (out * ch_gr + c_offset)); | |
Dtype2 s1_cache = src1_out[gId]; | |
Dtype2 s2_cache = src2_out_ch[gId]; | |
Dtype2 dst_cache = 0.f; | |
dst_cache.x = s1_cache.x * s2_cache.x - s1_cache.y * s2_cache.y; | |
dst_cache.y = s1_cache.x * s2_cache.y + s1_cache.y * s2_cache.x; | |
dst_ch[gId] += dst_cache; | |
} | |
/* Convert [RRRR...GGGG...BBBB...] to [RGBRGBRGBRGB...] */ | |
/* Reshape 2 */ | |
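/* Each work-item gathers the ch_gr complex values of one pixel (stride */ | |
/* `size` apart) and writes them contiguously, so the per-pixel channel */ | |
/* loops in the multiplication kernels read consecutive memory. */ | |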
__kernel void TEMPLATE(convert_data_to_channel_major,Dtype)(__global Dtype2* dst, | |
const __global Dtype2* src, const int_tp size, const int_tp ch_gr) { | |
int_tp gId = get_global_id(0); | |
__global Dtype* dst_ptr = (__global Dtype*)(dst + (gId * ch_gr)); | |
const __global Dtype* src_ptr = (const __global Dtype*)(src + gId); | |
Dtype2 s; | |
int_tp src_idx = 0; | |
for (int_tp i = 0; i < ch_gr; ++i) { | |
s = vload2(src_idx, src_ptr); | |
vstore2(s, i, dst_ptr); | |
src_idx += size; | |
} | |
} | |
/* Reshape 1 */ | |
/*__kernel void TEMPLATE(convert_data_to_channel_major,Dtype)(__global Dtype4* dst, | |
const __global Dtype4* src, const int_tp size, const int_tp ch_gr) { | |
int_tp gId = get_global_id(0); | |
const __global Dtype4* src_ptr4 = src + gId; | |
__global Dtype4* dst_ptr4 = dst + (gId * ch_gr); | |
for (int_tp i = 0; i < ch_gr; ++i) { | |
dst_ptr4[i] = src_ptr4[i*size]; | |
} | |
} | |
*/ | |
/* Convert multiple [RRRR...GGGG...BBBB...] to multiple [RGBRGBRGBRGB...] */ | |
/* Reshape 2 */ | |
__kernel void TEMPLATE(convert_weight_to_channel_major,Dtype)(__global Dtype2* dst, | |
const __global Dtype2* src, const int_tp size, const int_tp ch_gr, | |
const int_tp num_output) { | |
int_tp gId = get_global_id(0); | |
int_tp out = get_global_id(1); | |
int_tp out_offset = out * (size * ch_gr); | |
__global Dtype* dst_ptr = (__global Dtype*)(dst + out_offset + (gId * ch_gr)); | |
const __global Dtype* src_ptr = | |
(const __global Dtype*)(src + out_offset + gId); | |
Dtype2 s; | |
int_tp src_idx = 0; | |
for (int_tp i = 0; i < ch_gr; ++i) { | |
s = vload2(src_idx, src_ptr); | |
vstore2(s, i, dst_ptr); | |
src_idx += size; | |
} | |
} | |
/* Reshape 1 */ | |
/* | |
__kernel void TEMPLATE(convert_weight_to_channel_major,Dtype)(__global Dtype4* dst, | |
const __global Dtype4* src, const int_tp size, const int_tp ch_gr, | |
const int_tp out_gr) { | |
int_tp gId = get_global_id(0); | |
int_tp out = get_global_id(1); | |
int_tp out_offset = out * (size * ch_gr); | |
__global Dtype4* dst_ptr4 = dst + out_offset + (gId * ch_gr); | |
const __global Dtype4* src_ptr4 = src + out_offset + gId; | |
for (int_tp i = 0; i < ch_gr; ++i) { | |
dst_ptr4[i] = src_ptr4[size * i]; | |
} | |
} | |
*/ | |
/* Cdotc per element */ | |
/* Reshape 1 */ | |
/* | |
__kernel void TEMPLATE(batchedCdotc,Dtype)(__global Dtype4* dst, | |
const __global Dtype4* src1, const __global Dtype4* src2, | |
const int_tp size, const int_tp ch_gr, const int_tp out_gr) { | |
int_tp gId = get_global_id(0); | |
int_tp out = get_global_id(1); | |
int_tp ch_offset = gId * ch_gr; | |
int_tp out_offset = out * size; | |
const __global Dtype* src1_ptr = (const __global Dtype*)(src1 + ch_offset); | |
const __global Dtype* src2_ptr = (const __global Dtype*)(src2 + (out_offset * ch_gr) + ch_offset); | |
Dtype4 cdotc = 0.f; | |
Dtype4 s1, s2; | |
for (int_tp c = 0; c < ch_gr; ++c) { | |
s1 = vload4(c, src1_ptr); | |
s2 = vload4(c, src2_ptr); | |
cdotc.xz += mad( s1.xz, s2.xz, s1.yw * s2.yw); | |
cdotc.yw += mad(-s1.xz, s2.yw, s1.yw * s2.xz); | |
} | |
__global Dtype4* dst_ptr4 = dst + out_offset + gId; | |
dst_ptr4[0] += cdotc; | |
} | |
*/ | |
/* Cdotc per two elements */ | |
/* Reshape 2 */ | |
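/* Complex dot product (cdotc) over ch_gr channels for one output pixel: */ | |
/* channels are consumed two complex values at a time via Dtype4, the */ | |
/* two partial lanes are reduced with dot(), and an odd trailing channel */ | |
/* is handled as a single Dtype2. */ | |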
__kernel void TEMPLATE(batchedCdotc,Dtype)(__global Dtype2* dst, | |
const __global Dtype2* src1, const __global Dtype2* src2, | |
const int_tp size, const int_tp ch_gr, const int_tp out_gr) { | |
int_tp gId = get_global_id(0); | |
int_tp out = get_global_id(1); | |
int_tp ch_offset = gId * ch_gr; | |
const __global Dtype* src1_ptr = (const __global Dtype*)(src1 + ch_offset); | |
const __global Dtype* src2_ptr = | |
(const __global Dtype*)(src2 + (out * size * ch_gr) + ch_offset); | |
Dtype4 cdotc4 = 0.f; | |
Dtype2 cdotc = 0.f; | |
Dtype4 s1, s2; | |
int_tp n = ch_gr >> 1; | |
int_tp r = ch_gr - (n << 1); | |
for (int_tp i = 0; i < n; ++i) { | |
s1 = vload4(i, src1_ptr); | |
s2 = vload4(i, src2_ptr); | |
cdotc4.xz += mad( s1.xz, s2.xz, s1.yw * s2.yw); | |
cdotc4.yw += mad(-s1.xz, s2.yw, s1.yw * s2.xz); | |
} | |
cdotc.x += dot(cdotc4.xz, (Dtype2)(1)); /* (Dtype2), not (float2): keeps the double build valid */ | |
cdotc.y += dot(cdotc4.yw, (Dtype2)(1)); | |
if (r == 1) { | |
const __global Dtype* src1_ptr2 = | |
(const __global Dtype*)(((const __global Dtype4*)(src1_ptr)) + n); | |
const __global Dtype* src2_ptr2 = | |
(const __global Dtype*)(((const __global Dtype4*)(src2_ptr)) + n); | |
Dtype2 t1 = vload2(0, src1_ptr2); | |
Dtype2 t2 = vload2(0, src2_ptr2); | |
cdotc.x += mad( t1.x, t2.x, t1.y * t2.y); | |
cdotc.y += mad(-t1.x, t2.y, t1.y * t2.x); | |
} | |
__global Dtype* dst_ptr = (__global Dtype*)(dst + (out * size) + gId); | |
vstore2(cdotc, 0, dst_ptr); | |
} | |
#endif | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(fillbuffer,Dtype)(const int_tp n, const char alpha, __global char* x, | |
const int_tp offx) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
x[index + offx] = alpha; | |
} | |
} | |
__kernel void TEMPLATE(fill,Dtype)(const int_tp n, const Dtype alpha, __global Dtype* x, | |
const int_tp offx) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
x[index + offx] = alpha; | |
} | |
} | |
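/* Host-side usage sketch (illustrative, not part of the original */ | |
/* source): enqueuing the fill kernel above via the standard OpenCL C */ | |
/* API. With Dtype=float the TEMPLATE macro names it "fill_float"; the */ | |
/* `program`, `queue` and `buf` handles are assumed to exist already. */ | |
/* | |
cl_kernel k = clCreateKernel(program, "fill_float", NULL); | |
cl_int n = 1024; float alpha = 1.0f; cl_int offx = 0; | |
clSetKernelArg(k, 0, sizeof(cl_int), &n); | |
clSetKernelArg(k, 1, sizeof(float), &alpha); | |
clSetKernelArg(k, 2, sizeof(cl_mem), &buf); | |
clSetKernelArg(k, 3, sizeof(cl_int), &offx); | |
size_t gws = 16384;  // grid-stride loop in the kernel covers any n | |
clEnqueueNDRangeKernel(queue, k, 1, NULL, &gws, NULL, 0, NULL, NULL); | |
*/ | |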
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
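/* im2col: each work-item fills the kernel_h*kernel_w column entries for */ | |
/* one output position (c_im, h_col, w_col). data_col is laid out as */ | |
/* (channels * kernel_h * kernel_w) rows of (height_col * width_col), */ | |
/* and taps that fall into the padding are written as 0. */ | |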
__kernel void TEMPLATE(im2col,Dtype)(const int_tp n, | |
__global const Dtype* data_im, | |
const int_tp data_im_off, | |
const int_tp height, const int_tp width, | |
const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp pad_h, | |
const int_tp pad_w, const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp dilation_h, | |
const int_tp dilation_w, | |
const int_tp height_col, | |
const int_tp width_col, | |
__global Dtype* data_col, | |
const int_tp data_col_off) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
const int_tp h_index = index / width_col; | |
const int_tp h_col = h_index % height_col; | |
const int_tp w_col = index % width_col; | |
const int_tp c_im = h_index / height_col; | |
const int_tp c_col = c_im * kernel_h * kernel_w; | |
const int_tp h_offset = h_col * stride_h - pad_h; | |
const int_tp w_offset = w_col * stride_w - pad_w; | |
__global Dtype* data_col_ptr = data_col + data_col_off; | |
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; | |
__global const Dtype* data_im_ptr = data_im + data_im_off; | |
data_im_ptr += (c_im * height + h_offset) * width + w_offset; | |
for (int_tp i = 0; i < kernel_h; ++i) { | |
for (int_tp j = 0; j < kernel_w; ++j) { | |
int_tp h_im = h_offset + i * dilation_h; | |
int_tp w_im = w_offset + j * dilation_w; | |
*data_col_ptr = | |
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? | |
data_im_ptr[i * dilation_h * width + j * dilation_w] : 0; | |
data_col_ptr += height_col * width_col; | |
} | |
} | |
} | |
} | |
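/* col2im: the inverse scatter done as a gather. For each image element */ | |
/* (c_im, h_im, w_im) the start/end bounds enumerate every column */ | |
/* position whose receptive field covers it (inverting */ | |
/* h_im = h_col*stride_h + h_k*dilation_h), and the matching entries */ | |
/* are summed, skipping offsets not divisible by the dilation. */ | |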
__kernel void TEMPLATE(col2im,Dtype)(const int_tp n, | |
__global const Dtype* data_col, | |
const int_tp data_col_off, | |
const int_tp height, const int_tp width, | |
const int_tp channels, | |
const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp pad_h, | |
const int_tp pad_w, const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp dilation_h, | |
const int_tp dilation_w, | |
const int_tp height_col, | |
const int_tp width_col, | |
__global Dtype* data_im, | |
const int_tp data_im_off) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
Dtype val = 0; | |
const int_tp w_im = index % width + pad_w; | |
const int_tp h_im = (index / width) % height + pad_h; | |
const int_tp c_im = index / (width * height); | |
int_tp kernel_extent_w = (kernel_w - 1) * dilation_w + 1; | |
int_tp kernel_extent_h = (kernel_h - 1) * dilation_h + 1; | |
// compute the start and end of the output | |
const int_tp w_col_start = | |
(w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; | |
const int_tp w_col_end = min(w_im / stride_w + 1, width_col); | |
const int_tp h_col_start = | |
(h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1; | |
const int_tp h_col_end = min(h_im / stride_h + 1, height_col); | |
// TODO: use LCM of stride and dilation to avoid unnecessary loops | |
for (int_tp h_col = h_col_start; h_col < h_col_end; h_col += 1) { | |
for (int_tp w_col = w_col_start; w_col < w_col_end; w_col += 1) { | |
int_tp h_k = (h_im - h_col * stride_h); | |
int_tp w_k = (w_im - w_col * stride_w); | |
if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { | |
h_k /= dilation_h; | |
w_k /= dilation_w; | |
int_tp data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) * | |
height_col + h_col) * width_col + w_col; | |
val += data_col[data_col_off + data_col_index]; | |
} | |
} | |
} | |
data_im[data_im_off + index] = val; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
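/* N-D variants of im2col/col2im: the fixed d_*[6] scratch arrays cap */ | |
/* num_axes at 6 spatial axes, and the kernel/pad/stride/dilation */ | |
/* shapes are staged into local memory once per work-group up front. */ | |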
__kernel void TEMPLATE(im2col_nd, Dtype)(const int_tp n, const int_tp num_axes, | |
const int_tp channel_axis, | |
__global const Dtype* data_im, | |
const int_tp data_im_off, | |
__global const int_tp* im_shape, | |
__global const int_tp* col_shape, | |
__global const int_tp* kernel_shape, | |
__global const int_tp* pad, | |
__global const int_tp* stride, | |
__global const int_tp* dilation, | |
__global Dtype* data_col, | |
const int_tp data_col_off) { | |
int_tp d_temp[6]; | |
int_tp d_iter[6]; | |
int_tp i; | |
__global const int_tp* im_shape_ptr = im_shape + channel_axis; | |
__global const int_tp* col_shape_ptr = col_shape + channel_axis; | |
__local int_tp shared_dilation[6]; | |
__local int_tp shared_kernel_shape[6]; | |
__local int_tp shared_pad[6]; | |
__local int_tp shared_stride[6]; | |
__local int_tp shared_col_shape[6 + 1]; | |
__local int_tp shared_im_shape[6 + 1]; | |
for (int li = get_local_id(0); li < num_axes; li += get_local_size(0)) { | |
shared_dilation[li] = dilation[li]; | |
shared_kernel_shape[li] = kernel_shape[li]; | |
shared_pad[li] = pad[li]; | |
shared_stride[li] = stride[li]; | |
} | |
for (int li = get_local_id(0); li < num_axes + 1; li += get_local_size(0)) { | |
shared_col_shape[li] = col_shape_ptr[li]; | |
shared_im_shape[li] = im_shape_ptr[li]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
// Decompose the linear index: the loop below peels off the spatial | |
// indices into d_temp, leaving the input channel in channel_in. | |
int_tp channel_in = index; | |
int_tp channel_out = 1; | |
for (i = num_axes - 1; i >= 0; --i) { | |
d_temp[i] = channel_in % shared_col_shape[i + 1]; | |
channel_in /= shared_col_shape[i + 1]; | |
channel_out *= shared_kernel_shape[i]; | |
} | |
channel_out *= channel_in; | |
int_tp data_col_inc = 1; | |
for (i = 0; i < num_axes; ++i) { | |
channel_out *= shared_col_shape[i + 1]; | |
channel_out += d_temp[i]; | |
d_temp[i] = d_temp[i] * shared_stride[i] - shared_pad[i]; | |
channel_in *= shared_im_shape[i + 1]; | |
channel_in += d_temp[i]; | |
data_col_inc *= shared_col_shape[i + 1]; | |
d_iter[i] = 0; | |
} | |
__global Dtype* data_col_ptr = data_col + data_col_off + channel_out; | |
__global const Dtype* data_im_ptr = data_im + data_im_off + channel_in; | |
bool incremented; | |
do { | |
bool in_range = true; | |
for (i = 0; i < num_axes; ++i) { | |
const int_tp d_iter_im = d_iter[i] * shared_dilation[i] + d_temp[i]; | |
in_range &= d_iter_im >= 0 && d_iter_im < shared_im_shape[i + 1]; | |
if (!in_range) { | |
break; | |
} | |
} | |
if (in_range) { | |
int_tp data_im_offset = d_iter[0] * shared_dilation[0]; | |
for (i = 1; i < num_axes; ++i) { | |
data_im_offset *= shared_im_shape[i + 1]; | |
data_im_offset += d_iter[i] * shared_dilation[i]; | |
} | |
*data_col_ptr = data_im_ptr[data_im_offset]; | |
} else { | |
*data_col_ptr = 0; | |
} | |
data_col_ptr += data_col_inc; | |
incremented = false; | |
for (i = num_axes - 1; i >= 0; --i) { | |
const int_tp d_max = shared_kernel_shape[i]; | |
if (d_iter[i] == d_max - 1) { | |
d_iter[i] = 0; | |
} else { // d_iter[i] < d_max - 1 | |
++d_iter[i]; | |
incremented = true; | |
break; | |
} | |
} // for (int_tp i = num_axes - 1; i >= 0; --i) | |
} while (incremented); // do | |
} | |
} | |
__kernel void TEMPLATE(col2im_nd, Dtype)(const int_tp n, const int_tp num_axes, | |
const int_tp channel_axis, | |
__global const Dtype* data_col, | |
const int_tp data_col_off, | |
__global const int_tp* im_shape, | |
__global const int_tp* col_shape, | |
__global const int_tp* kernel_shape, | |
__global const int_tp* pad, | |
__global const int_tp* stride, | |
__global const int_tp* dilation, | |
__global Dtype* data_im, | |
const int_tp data_im_off) { | |
int_tp d_im[6]; | |
int_tp d_col_iter[6]; | |
int_tp d_col_start[6]; | |
int_tp d_col_end[6]; | |
__global const int_tp* im_shape_ptr = im_shape + channel_axis; | |
__global const int_tp* col_shape_ptr = col_shape + channel_axis; | |
__local int_tp shared_dilation[6]; | |
__local int_tp shared_kernel_shape[6]; | |
__local int_tp shared_pad[6]; | |
__local int_tp shared_stride[6]; | |
__local int_tp shared_col_shape[6 + 1]; | |
__local int_tp shared_im_shape[6 + 1]; | |
for (int li = get_local_id(0); li < num_axes; li += get_local_size(0)) { | |
shared_dilation[li] = dilation[li]; | |
shared_kernel_shape[li] = kernel_shape[li]; | |
shared_pad[li] = pad[li]; | |
shared_stride[li] = stride[li]; | |
} | |
for (int li = get_local_id(0); li < num_axes + 1; li += get_local_size(0)) { | |
shared_col_shape[li] = col_shape_ptr[li]; | |
shared_im_shape[li] = im_shape_ptr[li]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
// Decompose the linear index into the image channel (c_im) and the | |
// padded spatial coordinates (d_im) computed in the loops below. | |
int_tp c_im = index; | |
// Calculate d_im (image dimensions). | |
for (int_tp i = num_axes - 1; i >= 0; --i) { | |
d_im[i] = c_im % shared_im_shape[i + 1] + shared_pad[i]; | |
c_im /= shared_im_shape[i + 1]; | |
} | |
// Calculate col start/end indices. | |
bool done = false; | |
for (int_tp i = 0; i < num_axes; ++i) { | |
const int_tp kernel_extent = shared_dilation[i] | |
* (shared_kernel_shape[i] - 1) + 1; | |
d_col_start[i] = d_col_iter[i] = | |
(d_im[i] < kernel_extent) ? | |
0 : (d_im[i] - kernel_extent) / shared_stride[i] + 1; | |
d_col_end[i] = min(d_im[i] / shared_stride[i] + 1, | |
shared_col_shape[i + 1]); | |
if (d_col_start[i] >= d_col_end[i]) { | |
// Skip computation if the dimension is 0 at any spatial axis -- | |
// final val will be 0. | |
data_im[index] = 0; | |
done = true; | |
break; // for (int_tp i = 0; i < num_axes; ++i) | |
} | |
} | |
if (!done) { | |
// Loop over the col to compute the output val. | |
Dtype val = 0; | |
bool incremented = true; | |
bool skip = false; | |
do { | |
// Compute the final offset. | |
int_tp final_offset = 0; | |
int_tp kernel_shape_prod = 1; | |
int_tp kernel_index; | |
for (int_tp i = num_axes - 1; i >= 0; --i) { | |
kernel_index = d_im[i] - d_col_iter[i] * shared_stride[i]; | |
if (kernel_index % shared_dilation[i]) { | |
skip = true; | |
break; | |
} else { | |
kernel_index /= shared_dilation[i]; | |
final_offset += kernel_index * kernel_shape_prod; | |
kernel_shape_prod *= shared_kernel_shape[i]; | |
} | |
} | |
if (!skip) { | |
final_offset += kernel_shape_prod * c_im; | |
for (int_tp i = 0; i < num_axes; ++i) { | |
final_offset *= shared_col_shape[i + 1]; | |
final_offset += d_col_iter[i]; | |
} | |
val += data_col[data_col_off + final_offset]; | |
} | |
skip = false; | |
incremented = false; | |
for (int_tp i = num_axes - 1; i >= 0; --i) { | |
const int_tp d_max = d_col_end[i]; | |
if (d_col_iter[i] == d_max - 1) { | |
d_col_iter[i] = d_col_start[i]; | |
} else { // d_col_iter[i] < d_max - 1 | |
++d_col_iter[i]; | |
incremented = true; | |
break; // for (int_tp i = num_axes - 1; i >= 0; --i) | |
} | |
} // for (int_tp i = num_axes - 1; i >= 0; --i) | |
} while (incremented); | |
data_im[data_im_off + index] = val; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(lrn_compute_output,Dtype)(const int_tp nthreads, | |
__global const Dtype* in, | |
__global const Dtype* scale, | |
const Dtype negative_beta, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
out[index] = in[index] * pow(scale[index], negative_beta); | |
} | |
} | |
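/* Fills the LRN denominator with a running sum of squares over a */ | |
/* sliding window of `size` channels at fixed (n, h, w): */ | |
/*   scale[c] = k + (alpha/size) * sum over the window of in[c']^2 */ | |
/* The three while-loops are the ramp-up, steady-state (add head, drop */ | |
/* tail) and ramp-down phases of the window. */ | |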
__kernel void TEMPLATE(lrn_fill_scale,Dtype)(const int_tp nthreads, __global const Dtype* in, | |
const int_tp num, const int_tp channels, | |
const int_tp height, const int_tp width, const int_tp size, | |
const Dtype alpha_over_size, const Dtype k, | |
__global Dtype* const scale) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
// find out the local offset | |
const int_tp w = index % width; | |
const int_tp h = (index / width) % height; | |
const int_tp n = index / width / height; | |
const int_tp offset = (n * channels * height + h) * width + w; | |
const int_tp step = height * width; | |
__global const Dtype* in_off = in + offset; | |
__global Dtype* scale_off = scale + offset; | |
int_tp head = 0; | |
const int_tp pre_pad = (size - 1) / 2; | |
const int_tp post_pad = size - pre_pad - 1; | |
Dtype accum_scale = 0; | |
// fill the scale at [n, :, h, w] | |
// accumulate values | |
while (head < post_pad && head < channels) { | |
accum_scale += in_off[head * step] * in_off[head * step]; | |
++head; | |
} | |
// both add and subtract | |
while (head < channels) { | |
accum_scale += in_off[head * step] * in_off[head * step]; | |
if (head - size >= 0) { | |
accum_scale -= in_off[(head - size) * step] | |
* in_off[(head - size) * step]; | |
} | |
scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; | |
++head; | |
} | |
// subtract only | |
while (head < channels + post_pad) { | |
if (head - size >= 0) { | |
accum_scale -= in_off[(head - size) * step] | |
* in_off[(head - size) * step]; | |
} | |
scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; | |
++head; | |
} | |
} | |
} | |
__kernel void TEMPLATE(lrn_compute_diff,Dtype)(const int_tp nthreads, | |
__global const Dtype* bottom_data, | |
__global const Dtype* top_data, | |
__global const Dtype* scale, | |
__global const Dtype* top_diff, const int_tp num, | |
const int_tp channels, const int_tp height, | |
const int_tp width, const int_tp size, | |
const Dtype negative_beta, | |
const Dtype cache_ratio, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
// find out the local offset | |
const int_tp w = index % width; | |
const int_tp h = (index / width) % height; | |
const int_tp n = index / width / height; | |
const int_tp offset = (n * channels * height + h) * width + w; | |
const int_tp step = height * width; | |
__global const Dtype* bottom_off = bottom_data + offset; | |
__global const Dtype* top_off = top_data + offset; | |
__global const Dtype* scale_off = scale + offset; | |
__global const Dtype* top_diff_off = top_diff + offset; | |
__global Dtype* bottom_diff_off = bottom_diff + offset; | |
int_tp head = 0; | |
const int_tp pre_pad = size - (size + 1) / 2; | |
const int_tp post_pad = size - pre_pad - 1; | |
Dtype accum_ratio = 0; | |
// accumulate values | |
while (head < post_pad && head < channels) { | |
accum_ratio += top_diff_off[head * step] * top_off[head * step] | |
/ scale_off[head * step]; | |
++head; | |
} | |
// both add and subtract | |
while (head < channels) { | |
accum_ratio += top_diff_off[head * step] * top_off[head * step] | |
/ scale_off[head * step]; | |
if (head - size >= 0) { | |
accum_ratio -= top_diff_off[(head - size) * step] | |
* top_off[(head - size) * step] / scale_off[(head - size) * step]; | |
} | |
bottom_diff_off[(head - post_pad) * step] = top_diff_off[(head - post_pad) | |
* step] * pow(scale_off[(head - post_pad) * step], negative_beta) | |
- cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; | |
++head; | |
} | |
// subtract only | |
while (head < channels + post_pad) { | |
if (head - size >= 0) { | |
accum_ratio -= top_diff_off[(head - size) * step] | |
* top_off[(head - size) * step] / scale_off[(head - size) * step]; | |
} | |
bottom_diff_off[(head - post_pad) * step] = top_diff_off[(head - post_pad) | |
* step] * pow(scale_off[(head - post_pad) * step], negative_beta) | |
- cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; | |
++head; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
inline Dtype TEMPLATE(lstm_sigmoid,Dtype)(const Dtype x) { | |
return (Dtype)1 / ((Dtype)1 + exp(-x)); | |
} | |
inline Dtype TEMPLATE(lstm_tanh,Dtype)(const Dtype x) { | |
return (Dtype)2 * TEMPLATE(lstm_sigmoid,Dtype)((Dtype)2 * x) - (Dtype)1; | |
} | |
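/* LSTM kernels: X packs the four gates per step as [i, f, o, g], each */ | |
/* of width `dim`. The first three get a sigmoid, the candidate g gets */ | |
/* tanh (computed above via tanh(x) = 2*sigmoid(2x) - 1). The cell */ | |
/* update is c = cont * f * c_prev + i * g and h = o * tanh(c), with */ | |
/* `cont` as the sequence-continuation indicator that zeroes the carry. */ | |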
__kernel void TEMPLATE(lstm_acts_forward,Dtype)(const int_tp nthreads, const int_tp dim, | |
__global const Dtype* X, __global Dtype* X_acts) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp x_dim = 4 * dim; | |
const int_tp d = index % x_dim; | |
if (d < 3 * dim) { | |
X_acts[index] = TEMPLATE(lstm_sigmoid,Dtype)(X[index]); | |
} else { | |
X_acts[index] = TEMPLATE(lstm_tanh,Dtype)(X[index]); | |
} | |
} | |
} | |
__kernel void TEMPLATE(lstm_unit_forward,Dtype)(const int_tp nthreads, const int_tp dim, | |
__global const Dtype* C_prev, __global const Dtype* X, __global const Dtype* cont, | |
__global Dtype* C, __global Dtype* H) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp n = index / dim; | |
const int_tp d = index % dim; | |
__global const Dtype* X_offset = X + 4 * dim * n; | |
const Dtype i = X_offset[d]; | |
const Dtype f = X_offset[1 * dim + d]; | |
const Dtype o = X_offset[2 * dim + d]; | |
const Dtype g = X_offset[3 * dim + d]; | |
const Dtype c_prev = C_prev[index]; | |
const Dtype c = cont[n] * f * c_prev + i * g; | |
C[index] = c; | |
const Dtype tanh_c = TEMPLATE(lstm_tanh,Dtype)(c); | |
H[index] = o * tanh_c; | |
} | |
} | |
__kernel void TEMPLATE(lstm_unit_backward,Dtype)(const int_tp nthreads, const int_tp dim, | |
__global const Dtype* C_prev, __global const Dtype* X, __global const Dtype* C, __global const Dtype* H, | |
__global const Dtype* cont, __global const Dtype* C_diff, __global const Dtype* H_diff, | |
__global Dtype* C_prev_diff, __global Dtype* X_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp n = index / dim; | |
const int_tp d = index % dim; | |
__global const Dtype* X_offset = X + 4 * dim * n; | |
const Dtype i = X_offset[d]; | |
const Dtype f = X_offset[1 * dim + d]; | |
const Dtype o = X_offset[2 * dim + d]; | |
const Dtype g = X_offset[3 * dim + d]; | |
const Dtype c_prev = C_prev[index]; | |
const Dtype c = C[index]; | |
const Dtype tanh_c = TEMPLATE(lstm_tanh,Dtype)(c); | |
__global Dtype* c_prev_diff = C_prev_diff + index; | |
__global Dtype* X_diff_offset = X_diff + 4 * dim * n; | |
__global Dtype* i_diff = X_diff_offset + d; | |
__global Dtype* f_diff = X_diff_offset + 1 * dim + d; | |
__global Dtype* o_diff = X_diff_offset + 2 * dim + d; | |
__global Dtype* g_diff = X_diff_offset + 3 * dim + d; | |
const Dtype c_term_diff = | |
C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c); | |
const Dtype cont_n = cont[n]; | |
*c_prev_diff = cont_n * c_term_diff * f; | |
*i_diff = c_term_diff * g; | |
*f_diff = cont_n * c_term_diff * c_prev; | |
*o_diff = H_diff[index] * tanh_c; | |
*g_diff = c_term_diff * i; | |
} | |
} | |
__kernel void TEMPLATE(lstm_acts_backward,Dtype)(const int_tp nthreads, const int_tp dim, | |
__global const Dtype* X_acts, __global const Dtype* X_acts_diff, __global Dtype* X_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp x_dim = 4 * dim; | |
const int_tp d = index % x_dim; | |
const Dtype X_act = X_acts[index]; | |
if (d < 3 * dim) { | |
X_diff[index] = X_acts_diff[index] * X_act * ((Dtype)1 - X_act); | |
} else { | |
X_diff[index] = X_acts_diff[index] * ((Dtype)1 - X_act * X_act); | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(mul,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, | |
__global Dtype* b, | |
const int_tp offb, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index + offy] = a[index + offa] * b[index + offb]; | |
} | |
} | |
__kernel void TEMPLATE(div,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, | |
__global Dtype* b, | |
const int_tp offb, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index + offy] = a[index + offa] / b[index + offb]; | |
} | |
} | |
__kernel void TEMPLATE(add_scalar,Dtype)(const int_tp N, const Dtype alpha, | |
__global Dtype* Y, | |
const int_tp offY) { | |
for (int_tp index = get_global_id(0); index < N; index += get_global_size(0)) { | |
Y[offY + index] += alpha; | |
} | |
} | |
__kernel void TEMPLATE(add,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global const Dtype* b, | |
const int_tp offb, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = a[offa + index] + b[offb + index]; | |
} | |
} | |
__kernel void TEMPLATE(sub,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global const Dtype* b, | |
const int_tp offb, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = a[offa + index] - b[offb + index]; | |
} | |
} | |
__kernel void TEMPLATE(abs,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = fabs((Dtype)(a[offa + index])); | |
} | |
} | |
__kernel void TEMPLATE(exp,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = exp(a[offa + index]); | |
} | |
} | |
__kernel void TEMPLATE(log,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = log((Dtype)(a[offa + index])); | |
} | |
} | |
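/* For alpha == 2.0 the base goes through fabs() first; squaring makes */ | |
/* the sign irrelevant, and this presumably works around pow() */ | |
/* implementations that mishandle negative bases. */ | |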
__kernel void TEMPLATE(powx,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, Dtype alpha, | |
__global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
if (alpha == 2.0) { | |
y[offy + index] = pow((Dtype)fabs(a[offa + index]), (Dtype)alpha); | |
} else { | |
y[offy + index] = pow((Dtype)a[offa + index], (Dtype)alpha); | |
} | |
} | |
} | |
__kernel void TEMPLATE(sign,Dtype)(const int_tp n, __global const Dtype* x, | |
const int_tp offx, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index + offy] = (0.0 < x[index + offx]) | |
- (x[index + offx] < 0.0); | |
} | |
} | |
__kernel void TEMPLATE(sgnbit,Dtype)(const int_tp n, __global const Dtype* x, | |
const int_tp offx, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index + offy] = signbit(x[index + offx]); | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
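/* Merge/copy kernels for combining two feature maps whose spatial */ | |
/* shapes may differ: pad[i] = (shape_b[i] - shape_a[i]) / 2 centers the */ | |
/* smaller map inside the larger one. The _stack variants concatenate */ | |
/* along the channel axis, the _add variants sum into one channel set, */ | |
/* and the forward/backward flags gate each copy, writing 0 when */ | |
/* propagation for that input is disabled. */ | |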
__kernel void TEMPLATE(merge_copy_forward_stack, Dtype)(const int_tp nthreads, | |
const int_tp dims, | |
__global const Dtype* bottom_a, | |
const int_tp forward_a, | |
__global const Dtype* bottom_b, | |
const int_tp forward_b, | |
__global Dtype* top, | |
const int_tp num, | |
const int_tp channels_a, | |
const int_tp channels_b, | |
__global const int_tp* shape_a, | |
__global const int_tp* shape_b) { | |
int_tp pad[6]; | |
int_tp tmp_idx[6]; | |
int_tp size_a = 1; | |
int_tp size_b = 1; | |
for (int_tp i = 0; i < dims; ++i) { | |
pad[i] = (shape_b[i] - shape_a[i]) / 2; | |
size_a *= shape_a[i]; | |
size_b *= shape_b[i]; | |
} | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
int_tp batch_id = index / ((channels_a + channels_b) * size_a); | |
int_tp bottom_id = ((index - batch_id * (channels_a + channels_b) * size_a) | |
/ (channels_a * size_a)) % 2; | |
int_tp counter = index; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
tmp_idx[i] = counter % shape_a[i]; | |
counter /= shape_a[i]; | |
} | |
if (bottom_id == 0) { | |
int_tp channel_id = (index / size_a) % channels_a; | |
int_tp aidx = batch_id * channels_a + channel_id; | |
for (int_tp i = 0; i < dims; ++i) { | |
aidx *= shape_a[i]; | |
aidx += tmp_idx[i]; | |
} | |
top[index] = (forward_a == 1) ? bottom_a[aidx] : 0; | |
} else { | |
int_tp channel_id = (index / size_a) % channels_b; | |
int_tp bidx = (batch_id * channels_b + channel_id) * size_b; | |
int_tp btemp = 1; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
bidx += btemp * (tmp_idx[i] + pad[i]); | |
btemp *= shape_b[i]; | |
} | |
top[index] = (forward_b == 1) ? bottom_b[bidx] : 0; | |
} | |
} | |
} | |
__kernel void TEMPLATE(merge_copy_backward_stack,Dtype)(const int_tp nthreads, | |
const int_tp dims, | |
__global Dtype* bottom_a, | |
const int_tp backward_a, | |
__global Dtype* bottom_b, | |
const int_tp backward_b, | |
__global const Dtype* top, | |
const int_tp num, | |
const int_tp channels_a, | |
const int_tp channels_b, | |
__global const int_tp* shape_a, | |
__global const int_tp* shape_b) { | |
int_tp pad[6]; | |
int_tp tmp_idx[6]; | |
int_tp size_a = 1; | |
int_tp size_b = 1; | |
for (int_tp i = 0; i < dims; ++i) { | |
pad[i] = (shape_b[i] - shape_a[i]) / 2; | |
size_a *= shape_a[i]; | |
size_b *= shape_b[i]; | |
} | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp batch_id = index / ((channels_a + channels_b) * size_a); | |
int_tp bottom_id = ((index - batch_id * (channels_a + channels_b) * size_a) | |
/ (channels_a * size_a)) % 2; | |
int_tp counter = index; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
tmp_idx[i] = counter % shape_a[i]; | |
counter /= shape_a[i]; | |
} | |
if (bottom_id == 0) { | |
int_tp channel_id = (index / size_a) % channels_a; | |
int_tp aidx = batch_id * channels_a + channel_id; | |
for (int_tp i = 0; i < dims; ++i) { | |
aidx *= shape_a[i]; | |
aidx += tmp_idx[i]; | |
} | |
bottom_a[aidx] = (backward_a == 1) ? top[index] : 0; | |
} else { | |
int_tp channel_id = (index / size_a) % channels_b; | |
int_tp bidx = (batch_id * channels_b + channel_id) * size_b; | |
int_tp btemp = 1; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
bidx += btemp * (tmp_idx[i] + pad[i]); | |
btemp *= shape_b[i]; | |
} | |
bottom_b[bidx] = (backward_b == 1) ? top[index] : 0; | |
} | |
} | |
} | |
__kernel void TEMPLATE(merge_copy_forward_add, Dtype)(const int_tp nthreads, | |
const int_tp dims, | |
__global const Dtype* bottom_a, | |
const int_tp forward_a, | |
__global const Dtype* bottom_b, | |
const int_tp forward_b, | |
__global Dtype* top, | |
const int_tp num, | |
const int_tp channels, | |
__global const int_tp* shape_a, | |
__global const int_tp* shape_b) { | |
int_tp pad[6]; | |
int_tp tmp_idx[6]; | |
int_tp size_a = 1; | |
int_tp size_b = 1; | |
for (int_tp i = 0; i < dims; ++i) { | |
pad[i] = (shape_b[i] - shape_a[i]) / 2; | |
size_a *= shape_a[i]; | |
size_b *= shape_b[i]; | |
} | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp batch_id = index / (channels * size_a); | |
int_tp counter = index; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
tmp_idx[i] = counter % shape_a[i]; | |
counter /= shape_a[i]; | |
} | |
top[index] = 0; | |
int_tp channel_id = (index / size_a) % channels; | |
int_tp aidx = batch_id * channels + channel_id; | |
for (int_tp i = 0; i < dims; ++i) { | |
aidx *= shape_a[i]; | |
aidx += tmp_idx[i]; | |
} | |
top[index] = forward_a ? top[index] + bottom_a[aidx] : top[index]; | |
int_tp bidx = (batch_id * channels + channel_id) * size_b; | |
int_tp btemp = 1; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
bidx += btemp * (tmp_idx[i] + pad[i]); | |
btemp *= shape_b[i]; | |
} | |
top[index] = forward_b ? top[index] + bottom_b[bidx] : top[index]; | |
} | |
} | |
__kernel void TEMPLATE(merge_copy_backward_add,Dtype)(const int_tp nthreads, | |
const int_tp dims, | |
__global Dtype* bottom_a, | |
const int_tp backward_a, | |
__global Dtype* bottom_b, | |
const int_tp backward_b, | |
__global const Dtype* top, | |
const int_tp num, | |
const int_tp channels, | |
__global const int_tp* shape_a, | |
__global const int_tp* shape_b) { | |
int_tp pad[6]; | |
int_tp tmp_idx[6]; | |
int_tp size_a = 1; | |
int_tp size_b = 1; | |
for (int_tp i = 0; i < dims; ++i) { | |
pad[i] = (shape_b[i] - shape_a[i]) / 2; | |
size_a *= shape_a[i]; | |
size_b *= shape_b[i]; | |
} | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp batch_id = index / (channels * size_a); | |
int_tp counter = index; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
tmp_idx[i] = counter % shape_a[i]; | |
counter /= shape_a[i]; | |
} | |
int_tp channel_id = (index / size_a) % channels; | |
int_tp aidx = batch_id * channels + channel_id; | |
for (int_tp i = 0; i < dims; ++i) { | |
aidx *= shape_a[i]; | |
aidx += tmp_idx[i]; | |
} | |
bottom_a[aidx] = backward_a ? top[index] : 0; | |
int_tp bidx = (batch_id * channels + channel_id) * size_b; | |
int_tp btemp = 1; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
bidx += btemp * (tmp_idx[i] + pad[i]); | |
btemp *= shape_b[i]; | |
} | |
bottom_b[bidx] = backward_b ? top[index] : 0; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(max_pool_forward,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* top_data, | |
const int use_mask, __global int_tp* mask, __global Dtype* top_mask) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp pw = index % pooled_width; | |
const int_tp ph = (index / pooled_width) % pooled_height; | |
const int_tp c = (index / pooled_width / pooled_height) % channels; | |
const int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
const int_tp hend = min(hstart + kernel_h, height); | |
const int_tp wend = min(wstart + kernel_w, width); | |
hstart = max(hstart, (int_tp)0); | |
wstart = max(wstart, (int_tp)0); | |
Dtype maxval = -FLT_MAX; | |
int_tp maxidx = -1; | |
__global const Dtype* bottom_slice = bottom_data | |
+ (n * channels + c) * height * width; | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
if (bottom_slice[h * width + w] > maxval) { | |
maxidx = h * width + w; | |
maxval = bottom_slice[maxidx]; | |
} | |
} | |
} | |
top_data[index] = maxval; | |
if (use_mask == 1) { | |
mask[index] = maxidx; | |
} else { | |
top_mask[index] = maxidx; | |
} | |
} | |
} | |
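/* Note: pool_size is taken from the padded window extents before */ | |
/* clamping, so padded positions count toward the divisor (the usual */ | |
/* Caffe average-pooling convention). */ | |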
__kernel void TEMPLATE(ave_pool_forward,Dtype)( | |
const int_tp nthreads, __global const Dtype* const bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, | |
const int_tp pad_w, __global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp pw = index % pooled_width; | |
const int_tp ph = (index / pooled_width) % pooled_height; | |
const int_tp c = (index / pooled_width / pooled_height) % channels; | |
const int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
int_tp hend = min(hstart + kernel_h, height + pad_h); | |
int_tp wend = min(wstart + kernel_w, width + pad_w); | |
const int_tp pool_size = (hend - hstart) * (wend - wstart); | |
hstart = max(hstart, (int_tp)0); | |
wstart = max(wstart, (int_tp)0); | |
hend = min(hend, height); | |
wend = min(wend, width); | |
Dtype aveval = 0; | |
__global const Dtype* bottom_slice = bottom_data | |
+ (n * channels + c) * height * width; | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
aveval += bottom_slice[h * width + w]; | |
} | |
} | |
top_data[index] = aveval / pool_size; | |
} | |
} | |
} | |
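// Stochastic pooling, training pass. rand_idx[index] arrives holding a
// uniform sample in [0, 1); pass one sums the window, pass two walks the
// cumulative sum until it crosses rand_idx[index] * sum, which selects each
// position with probability proportional to its activation. rand_idx[index]
// is then overwritten with the chosen input index for the backward pass.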
__kernel void TEMPLATE(sto_pool_forward_train,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, | |
__global Dtype* rand_idx, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp pw = index % pooled_width; | |
const int_tp ph = (index / pooled_width) % pooled_height; | |
const int_tp c = (index / pooled_width / pooled_height) % channels; | |
const int_tp n = index / pooled_width / pooled_height / channels; | |
const int_tp hstart = ph * stride_h; | |
const int_tp hend = min(hstart + kernel_h, height); | |
const int_tp wstart = pw * stride_w; | |
const int_tp wend = min(wstart + kernel_w, width); | |
Dtype cumsum = 0.; | |
__global const Dtype* bottom_slice = bottom_data | |
+ (n * channels + c) * height * width; | |
// First pass: get sum | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
cumsum += bottom_slice[h * width + w]; | |
} | |
} | |
    const Dtype thres = rand_idx[index] * cumsum;
// Second pass: get value, and set index. | |
cumsum = 0; | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
cumsum += bottom_slice[h * width + w]; | |
if (cumsum >= thres) { | |
rand_idx[index] = ((n * channels + c) * height + h) * width + w; | |
top_data[index] = bottom_slice[h * width + w]; | |
h = hend; | |
w = wend; | |
} | |
} | |
} | |
} | |
} | |
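// Stochastic pooling, test pass. Instead of sampling, compute the
// expectation of the sampled value: sum(x_i^2) / sum(x_i), i.e. each
// activation weighted by its selection probability x_i / sum(x_i).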
__kernel void TEMPLATE(sto_pool_forward_test,Dtype)( | |
const int_tp nthreads, __global const Dtype* const bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp pw = index % pooled_width; | |
const int_tp ph = (index / pooled_width) % pooled_height; | |
const int_tp c = (index / pooled_width / pooled_height) % channels; | |
const int_tp n = index / pooled_width / pooled_height / channels; | |
const int_tp hstart = ph * stride_h; | |
const int_tp hend = min(hstart + kernel_h, height); | |
const int_tp wstart = pw * stride_w; | |
const int_tp wend = min(wstart + kernel_w, width); | |
    // Initialize cumsum to FLT_MIN (not 0) so the division below cannot be by zero.
Dtype cumsum = FLT_MIN; | |
Dtype cumvalues = 0.; | |
__global const Dtype* bottom_slice = bottom_data | |
+ (n * channels + c) * height * width; | |
// First pass: get sum | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
cumsum += bottom_slice[h * width + w]; | |
cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; | |
} | |
} | |
top_data[index] = cumvalues / cumsum; | |
} | |
} | |
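// Max pooling, backward pass. Each bottom element scans every pooled window
// that could contain it and accumulates top_diff wherever the recorded
// argmax (mask or top_mask) equals its own linear position.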
__kernel void TEMPLATE(max_pool_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top_diff, | |
const int use_mask, | |
__global const int_tp* mask, | |
__global const Dtype* top_mask, | |
const int_tp num, | |
const int_tp channels, | |
const int_tp height, | |
const int_tp width, | |
const int_tp pooled_height, | |
const int_tp pooled_width, | |
const int_tp kernel_h, | |
const int_tp kernel_w, | |
const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
    // recover the local (w, h, c, n) coordinates from the linear index
const int_tp w = index % width; | |
const int_tp h = (index / width) % height; | |
const int_tp c = (index / width / height) % channels; | |
const int_tp n = index / width / height / channels; | |
const int_tp phstart = | |
(h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; | |
const int_tp phend = min((h + pad_h) / stride_h + 1, pooled_height); | |
const int_tp pwstart = | |
(w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1; | |
const int_tp pwend = min((w + pad_w) / stride_w + 1, pooled_width); | |
Dtype gradient = 0; | |
const int_tp offset = (n * channels + c) * pooled_height * pooled_width; | |
__global const Dtype* top_diff_slice = top_diff + offset; | |
if (use_mask == 1) { | |
__global const int_tp* mask_slice = mask + offset; | |
for (int_tp ph = phstart; ph < phend; ++ph) { | |
for (int_tp pw = pwstart; pw < pwend; ++pw) { | |
if (mask_slice[ph * pooled_width + pw] == h * width + w) { | |
gradient += top_diff_slice[ph * pooled_width + pw]; | |
} | |
} | |
} | |
} else { | |
__global const Dtype* top_mask_slice = top_mask + offset; | |
for (int_tp ph = phstart; ph < phend; ++ph) { | |
for (int_tp pw = pwstart; pw < pwend; ++pw) { | |
if (top_mask_slice[ph * pooled_width + pw] == h * width + w) { | |
gradient += top_diff_slice[ph * pooled_width + pw]; | |
} | |
} | |
} | |
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
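// Average pooling, backward pass. Each bottom element receives
// top_diff / pool_size from every window covering it, with pool_size
// recomputed exactly as in the forward pass (over the padded window).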
__kernel void TEMPLATE(ave_pool_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top_diff, | |
const int_tp num, | |
const int_tp channels, | |
const int_tp height, | |
const int_tp width, | |
const int_tp pooled_height, | |
const int_tp pooled_width, | |
const int_tp kernel_h, | |
const int_tp kernel_w, | |
const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
    // recover the local (w, h, c, n) coordinates from the linear index
const int_tp w = index % width + pad_w; | |
const int_tp h = (index / width) % height + pad_h; | |
const int_tp c = (index / width / height) % channels; | |
const int_tp n = index / width / height / channels; | |
const int_tp phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; | |
const int_tp phend = min(h / stride_h + 1, pooled_height); | |
const int_tp pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; | |
const int_tp pwend = min(w / stride_w + 1, pooled_width); | |
Dtype gradient = 0.0; | |
__global const Dtype* const top_diff_slice = top_diff | |
+ (n * channels + c) * pooled_height * pooled_width; | |
for (int_tp ph = phstart; ph < phend; ++ph) { | |
for (int_tp pw = pwstart; pw < pwend; ++pw) { | |
// figure out the pooling size | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
int_tp hend = min(hstart + kernel_h, height + pad_h); | |
int_tp wend = min(wstart + kernel_w, width + pad_w); | |
int_tp pool_size = (hend - hstart) * (wend - wstart); | |
gradient += top_diff_slice[ph * pooled_width + pw] / pool_size; | |
} | |
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
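// Stochastic pooling, backward pass. The gradient is routed solely to the
// input position that was sampled in the forward pass, as recorded in
// rand_idx.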
__kernel void TEMPLATE(sto_pool_backward,Dtype)( | |
const int_tp nthreads, __global const Dtype* rand_idx, | |
__global const Dtype* const top_diff, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h, | |
const int_tp stride_w, __global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
    // recover the local (w, h, c, n) coordinates from the linear index
const int_tp w = index % width; | |
const int_tp h = (index / width) % height; | |
const int_tp c = (index / width / height) % channels; | |
const int_tp n = index / width / height / channels; | |
const int_tp phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; | |
const int_tp phend = min(h / stride_h + 1, pooled_height); | |
const int_tp pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; | |
const int_tp pwend = min(w / stride_w + 1, pooled_width); | |
Dtype gradient = 0.0; | |
__global const Dtype* rand_idx_slice = rand_idx | |
+ (n * channels + c) * pooled_height * pooled_width; | |
__global const Dtype* top_diff_slice = top_diff | |
+ (n * channels + c) * pooled_height * pooled_width; | |
for (int_tp ph = phstart; ph < phend; ++ph) { | |
for (int_tp pw = pwstart; pw < pwend; ++pw) { | |
gradient += top_diff_slice[ph * pooled_width + pw] | |
* (index == (int_tp) (rand_idx_slice[ph * pooled_width + pw])?1.0:0.0); | |
} | |
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
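// N-dimensional max pooling, forward pass (up to 6 spatial axes, per the
// fixed-size index arrays below). Per-axis window bounds come from stride,
// pad and the dilated extent ext_kernel_size; the do/while loop enumerates
// the window by stepping d_iter like an odometer, dilation[i] at a time.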
__kernel void TEMPLATE(max_pool_forward_nd, Dtype)(const int_tp n, | |
const int_tp num_axes, | |
__global const Dtype* bottom_data, | |
const int_tp channels, | |
__global const int_tp* size, | |
__global const int_tp* pooled_size, | |
__global const int_tp* kernel_size, | |
__global const int_tp* ext_kernel_size, | |
__global const int_tp* stride, | |
__global const int_tp* dilation, | |
__global const int_tp* pad, | |
__global Dtype* top_data, | |
const int use_mask, | |
__global int_tp* mask, __global Dtype* top_mask) { | |
int_tp d_idx[6]; | |
int_tp d_start[6]; | |
int_tp d_end[6]; | |
int_tp d_iter[6]; | |
int_tp i; | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
int_tp offset = 1; | |
int_tp num = index; | |
bool do_continue = false; | |
for (i = num_axes - 1; i >= 0; --i) { | |
d_idx[i] = num % pooled_size[i]; | |
d_start[i] = d_idx[i] * stride[i] - pad[i]; | |
d_end[i] = min(d_start[i] + ext_kernel_size[i], size[i]); | |
d_start[i] = max(d_start[i], (int_tp)0); | |
num /= pooled_size[i]; | |
offset *= size[i]; | |
d_iter[i] = d_start[i]; | |
if (d_start[i] >= d_end[i]) { | |
top_data[index] = -FLT_MAX; | |
if (use_mask) { | |
mask[index] = -1; | |
} else { | |
top_mask[index] = -1; | |
} | |
do_continue = true; | |
} | |
} | |
if(do_continue) { | |
continue; | |
} | |
int_tp chan = num % channels; | |
num /= channels; | |
offset *= (num * channels + chan); | |
Dtype maxval = -FLT_MAX; | |
int_tp maxidx = -1; | |
int_tp final_offset = 0; | |
bool incremented; | |
do { | |
final_offset = offset; | |
int_tp size_prod = 1; | |
for (i = num_axes - 1; i >= 0; --i) { | |
final_offset += d_iter[i] * size_prod; | |
size_prod *= size[i]; | |
} | |
if (bottom_data[final_offset] > maxval) { | |
maxidx = final_offset; | |
maxval = bottom_data[maxidx]; | |
} | |
incremented = false; | |
for (i = num_axes - 1; i >= 0; --i) { | |
if (d_iter[i] >= d_end[i] - dilation[i]) { | |
d_iter[i] = d_start[i]; | |
} else { | |
d_iter[i] += dilation[i]; | |
incremented = true; | |
break; | |
} | |
} | |
} while (incremented); | |
top_data[index] = maxval; | |
if (use_mask == 1) { | |
mask[index] = maxidx; | |
} else { | |
top_mask[index] = maxidx; | |
} | |
} | |
} | |
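// N-dimensional max pooling, backward pass. Here the odometer runs over the
// candidate pooled windows, and gradients are gathered where the stored
// argmax matches this element's image offset.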
__kernel void TEMPLATE(max_pool_backward_nd, Dtype)(const int_tp n, | |
const int_tp num_axes, | |
__global const Dtype* top_diff, | |
const int use_mask, | |
__global const int_tp* mask, | |
__global const Dtype* top_mask, | |
const int_tp channels, | |
__global const int_tp* size, | |
__global const int_tp* pooled_size, | |
__global const int_tp* kernel_size, | |
__global const int_tp* ext_kernel_size, | |
__global const int_tp* stride, | |
__global const int_tp* dilation, | |
__global const int_tp* pad, | |
__global Dtype* bottom_diff) { | |
int_tp d_idx[6]; | |
int_tp d_start[6]; | |
int_tp d_end[6]; | |
int_tp d_iter[6]; | |
int_tp i; | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
    // recover the local coordinates from the linear index
int_tp offset = 1; | |
    int_tp num = index;
    bool do_continue = false;
for (i = num_axes - 1; i >= 0; --i) { | |
d_idx[i] = num % size[i]; | |
if (dilation[i] > 1) { | |
d_start[i] = | |
(d_idx[i] < ext_kernel_size[i]) ? | |
d_idx[i] % dilation[i] : (d_idx[i] - ext_kernel_size[i]) + 1; | |
d_end[i] = | |
(d_idx[i] >= pooled_size[i]) ? | |
(pooled_size[i] - 1) | |
- (pooled_size[i] - 1 - d_start[i]) % dilation[i] : | |
d_idx[i]; | |
} else { | |
d_start[i] = | |
(d_idx[i] + pad[i] < kernel_size[i]) ? | |
0 : (d_idx[i] + pad[i] - kernel_size[i]) / stride[i] + 1; | |
d_end[i] = min((int_tp) ((d_idx[i] + pad[i]) / stride[i] + 1), | |
(int_tp) (pooled_size[i])); | |
} | |
num /= size[i]; | |
offset *= pooled_size[i]; | |
d_iter[i] = d_start[i]; | |
      if (d_start[i] > d_end[i]) {
        bottom_diff[index] = 0;
        // Flag and skip below instead of returning: a return here would also
        // abandon the remaining indices of this work-item's grid-stride loop.
        do_continue = true;
      }
    }
    if (do_continue) {
      continue;
    }
int_tp chan = num % channels; | |
num /= channels; | |
offset *= (num * channels + chan); | |
Dtype gradient = 0; | |
int_tp final_offset = 0; | |
int_tp im_offset = 0; | |
bool incremented; | |
do { | |
final_offset = offset; | |
im_offset = 0; | |
int_tp size_prod = 1; | |
int_tp pooled_size_prod = 1; | |
for (i = num_axes - 1; i >= 0; --i) { | |
final_offset += d_iter[i] * pooled_size_prod; | |
im_offset += d_idx[i] * size_prod; | |
size_prod *= size[i]; | |
pooled_size_prod *= pooled_size[i]; | |
} | |
if (use_mask) { | |
if (mask[final_offset] == im_offset) { | |
gradient += top_diff[final_offset]; | |
} | |
} else { | |
if (top_mask[final_offset] == im_offset) { | |
gradient += top_diff[final_offset]; | |
} | |
} | |
incremented = false; | |
for (i = num_axes - 1; i >= 0; --i) { | |
if (d_iter[i] > d_end[i] - dilation[i]) { | |
d_iter[i] = d_start[i]; | |
} else { | |
d_iter[i] += dilation[i]; | |
incremented = true; | |
break; | |
} | |
} | |
} while (incremented); | |
bottom_diff[index] = gradient; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
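// The *_sk kernels below appear to be the strided-kernel (dilated) pooling
// variants: the window spans ext_kernel_{h,w} and is traversed with step
// dilation_{h,w} instead of 1.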
__kernel void TEMPLATE(max_pool_forward_sk,Dtype)(const int_tp nthreads, | |
__global Dtype* bottom_data, | |
const int_tp num, | |
const int_tp channels, | |
const int_tp height, | |
const int_tp width, | |
const int_tp pooled_height, | |
const int_tp pooled_width, | |
const int_tp kernel_h, | |
const int_tp kernel_w, | |
const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, | |
const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp dilation_h, | |
const int_tp dilation_w, | |
const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* top_data, | |
const int use_mask, | |
__global int_tp* mask, | |
__global Dtype* top_mask) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp pw = index % pooled_width; | |
int_tp ph = (index / pooled_width) % pooled_height; | |
int_tp c = (index / pooled_width / pooled_height) % channels; | |
int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
int_tp hend = min(hstart + ext_kernel_h, height); | |
int_tp wend = min(wstart + ext_kernel_w, width); | |
hstart = max(hstart, (int_tp) 0); | |
wstart = max(wstart, (int_tp) 0); | |
Dtype maxval = -FLT_MAX; | |
int_tp maxidx = -1; | |
__global Dtype* bottom_data_ptr = bottom_data | |
+ (n * channels + c) * height * width; | |
for (int_tp h = hstart; h < hend; h += dilation_h) { | |
for (int_tp w = wstart; w < wend; w += dilation_w) { | |
if (bottom_data_ptr[h * width + w] > maxval) { | |
maxidx = h * width + w; | |
maxval = bottom_data_ptr[maxidx]; | |
} | |
} | |
} | |
top_data[index] = maxval; | |
if (use_mask == 1) { | |
mask[index] = maxidx; | |
} else { | |
top_mask[index] = maxidx; | |
} | |
} | |
} | |
__kernel void TEMPLATE(max_pool_backward_sk,Dtype)( | |
const int_tp nthreads, __global const Dtype* top_diff, const int use_mask, | |
__global const int_tp* mask, __global const Dtype* top_mask, | |
const int_tp num, const int_tp channels, const int_tp height, | |
const int_tp width, const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, | |
const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
__global const int_tp* mask_ptr = mask; | |
__global const Dtype* top_diff_ptr = top_diff; | |
    // recover the local (w, h, c, n) coordinates from the linear index
int_tp w = index % width; | |
int_tp h = (index / width) % height; | |
int_tp c = (index / width / height) % channels; | |
int_tp n = index / width / height / channels; | |
int_tp pooled_height_1 = pooled_height - 1; | |
int_tp pooled_width_1 = pooled_width - 1; | |
int_tp phstart = | |
(h < ext_kernel_h) ? h % dilation_h : (h - ext_kernel_h) + 1; | |
int_tp phend = | |
(h >= pooled_height) ? | |
pooled_height_1 - (pooled_height_1 - phstart) % dilation_h : h; | |
int_tp pwstart = | |
(w < ext_kernel_w) ? w % dilation_w : (w - ext_kernel_w) + 1; | |
int_tp pwend = | |
(w >= pooled_width) ? | |
pooled_width_1 - (pooled_width_1 - pwstart) % dilation_w : w; | |
Dtype gradient = 0; | |
int_tp offset = (n * channels + c) * pooled_height * pooled_width; | |
top_diff_ptr += offset; | |
if (use_mask == 1) { | |
mask_ptr += offset; | |
for (int_tp ph = phstart; ph <= phend; ph += dilation_h) { | |
for (int_tp pw = pwstart; pw <= pwend; pw += dilation_w) { | |
if (mask_ptr[ph * pooled_width + pw] == h * width + w) { | |
gradient += top_diff_ptr[ph * pooled_width + pw]; | |
} | |
} | |
} | |
    } else {
      __global const Dtype* top_mask_ptr = top_mask + offset;
      for (int_tp ph = phstart; ph <= phend; ph += dilation_h) {
        for (int_tp pw = pwstart; pw <= pwend; pw += dilation_w) {
          // Offset top_mask like mask above; indexing the unshifted buffer
          // here was a bug.
          if (top_mask_ptr[ph * pooled_width + pw] == h * width + w) {
            gradient += top_diff_ptr[ph * pooled_width + pw];
          }
        }
      }
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
__kernel void TEMPLATE(ave_pool_forward_sk,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, | |
const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp pw = index % pooled_width; | |
int_tp ph = (index / pooled_width) % pooled_height; | |
int_tp c = (index / pooled_width / pooled_height) % channels; | |
int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
int_tp hend = min(hstart + ext_kernel_h, height + pad_h); | |
int_tp wend = min(wstart + ext_kernel_w, width + pad_w); | |
hstart = max(hstart, (int_tp)0); | |
wstart = max(wstart, (int_tp)0); | |
hend = min(hend, height); | |
wend = min(wend, width); | |
Dtype aveval = 0; | |
__global const Dtype* bottom_data_ptr = bottom_data; | |
bottom_data_ptr += (n * channels + c) * height * width; | |
int_tp pool_size = 0; | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
aveval += bottom_data_ptr[h * width + w]; | |
++pool_size; | |
} | |
} | |
top_data[index] = aveval / pool_size; | |
} | |
} | |
__kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, | |
const int_tp dilation_h, const int_tp dilation_w, __global Dtype* rand_idx, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp pw = index % pooled_width; | |
int_tp ph = (index / pooled_width) % pooled_height; | |
int_tp c = (index / pooled_width / pooled_height) % channels; | |
int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h; | |
int_tp hend = min(hstart + ext_kernel_h, height); | |
int_tp wstart = pw * stride_w; | |
int_tp wend = min(wstart + ext_kernel_w, width); | |
Dtype cumsum = 0.; | |
__global const Dtype* bottom_data_ptr = bottom_data; | |
bottom_data_ptr += (n * channels + c) * height * width; | |
// First pass: get sum | |
for (int_tp h = hstart; h < hend; h += dilation_h) { | |
for (int_tp w = wstart; w < wend; w += dilation_w) { | |
cumsum += bottom_data_ptr[h * width + w]; | |
} | |
} | |
    Dtype thres = rand_idx[index] * cumsum;
// Second pass: get value, and set index. | |
cumsum = 0; | |
for (int_tp h = hstart; h < hend; h += dilation_h) { | |
for (int_tp w = wstart; w < wend; w += dilation_w) { | |
cumsum += bottom_data_ptr[h * width + w]; | |
if (cumsum >= thres) { | |
rand_idx[index] = ((n * channels + c) * height + h) * width + w; | |
top_data[index] = bottom_data_ptr[h * width + w]; | |
h = hend; | |
w = wend; | |
} | |
} | |
} | |
} | |
} | |
__kernel void TEMPLATE(sto_pool_forward_test_sk,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, | |
const int_tp dilation_h, const int_tp dilation_w, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp pw = index % pooled_width; | |
int_tp ph = (index / pooled_width) % pooled_height; | |
int_tp c = (index / pooled_width / pooled_height) % channels; | |
int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h; | |
int_tp hend = min(hstart + ext_kernel_h, height); | |
int_tp wstart = pw * stride_w; | |
int_tp wend = min(wstart + ext_kernel_w, width); | |
    // Initialize cumsum to FLT_MIN (not 0) so the division below cannot be by zero.
Dtype cumsum = FLT_MIN; | |
Dtype cumvalues = 0.; | |
__global const Dtype* bottom_data_ptr = bottom_data; | |
bottom_data_ptr += (n * channels + c) * height * width; | |
// First pass: get sum | |
for (int_tp h = hstart; h < hend; h += dilation_h) { | |
for (int_tp w = wstart; w < wend; w += dilation_w) { | |
cumsum += bottom_data_ptr[h * width + w]; | |
cumvalues += bottom_data_ptr[h * width + w] | |
* bottom_data_ptr[h * width + w]; | |
} | |
} | |
top_data[index] = cumvalues / cumsum; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
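// Slice layer. One kernel serves both directions: with forward == 1 it
// copies a slice out of the bottom blob, otherwise it scatters gradients
// back, using the same index mapping between whole and sliced layouts.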
__kernel void TEMPLATE(slice,Dtype)(const int_tp nthreads, | |
__global const Dtype* in_data, | |
const int forward, const int_tp num_slices, | |
const int_tp slice_size, | |
const int_tp bottom_slice_axis, | |
const int_tp top_slice_axis, | |
const int_tp offset_slice_axis, | |
__global Dtype* out_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp total_slice_size = slice_size * top_slice_axis; | |
const int_tp slice_num = index / total_slice_size; | |
const int_tp slice_index = index % total_slice_size; | |
const int_tp bottom_index = slice_index | |
+ (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size; | |
if (forward == 1) { | |
out_data[index] = in_data[bottom_index]; | |
} else { | |
out_data[bottom_index] = in_data[index]; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
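// Softmax loss, forward pass: per (sample, spatial position) the loss is
// -log(prob[label]), with the probability clamped to FLT_MIN so the log
// stays finite; counts marks contributing positions (ignored labels
// contribute 0 to both).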
__kernel void TEMPLATE(softmax_loss_forward,Dtype)( | |
int_tp n, __global const Dtype* prob_data, __global const Dtype* label, | |
__global Dtype* loss, | |
const int_tp num, const int_tp dim, const int_tp spatial_dim, | |
const int has_ignore_label_, const int_tp ignore_label_, | |
__global Dtype* counts) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
    const int_tp n = index / spatial_dim;  // shadows the kernel argument n; the loop bound above still binds to the argument
const int_tp s = index % spatial_dim; | |
const int_tp label_value = (int_tp) (label[n * spatial_dim + s]); | |
if (has_ignore_label_ == 1 && label_value == ignore_label_) { | |
loss[index] = 0; | |
counts[index] = 0; | |
} else { | |
loss[index] = -log((Dtype)( | |
max((Dtype) (prob_data[n * dim + label_value * spatial_dim + s]), | |
(Dtype) FLT_MIN))); | |
counts[index] = 1; | |
} | |
} | |
} | |
__kernel void TEMPLATE(softmax_loss_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top, | |
__global const Dtype* label, | |
__global Dtype* bottom_diff, | |
const int_tp num, | |
const int_tp dim, | |
const int_tp spatial_dim, | |
const int has_ignore_label_, | |
const int_tp ignore_label_, | |
__global Dtype* counts) { | |
const int_tp channels = dim / spatial_dim; | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
const int_tp n = index / spatial_dim; | |
const int_tp s = index % spatial_dim; | |
const int_tp label_value = (int_tp) (label[n * spatial_dim + s]); | |
if (has_ignore_label_ == 1 && label_value == ignore_label_) { | |
for (int_tp c = 0; c < channels; ++c) { | |
bottom_diff[n * dim + c * spatial_dim + s] = 0; | |
} | |
counts[index] = 0; | |
} else { | |
bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; | |
counts[index] = 1; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
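// Solver update kernels. Each reads the raw gradient from g and persistent
// state from h (plus h2/m/v where present), updates the state in place, and
// writes the final update step back into g for the caller to apply.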
__kernel void TEMPLATE(ada_delta_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
__global Dtype* h2, | |
Dtype momentum, | |
Dtype delta, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype gi = g[i]; | |
Dtype hi = h[i] = momentum * h[i] + (1.0 - momentum) * gi * gi; | |
gi = gi * sqrt((h2[i] + delta) / (hi + delta)); | |
h2[i] = momentum * h2[i] + (1.0 - momentum) * gi * gi; | |
g[i] = local_rate * gi; | |
} | |
} | |
__kernel void TEMPLATE(ada_grad_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
Dtype delta, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype gi = g[i]; | |
Dtype hi = h[i] = h[i] + gi * gi; | |
g[i] = local_rate * gi / (sqrt(hi) + delta); | |
} | |
} | |
__kernel void TEMPLATE(adam_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* m, | |
__global Dtype* v, | |
Dtype beta1, | |
Dtype beta2, | |
Dtype eps_hat, | |
Dtype corrected_local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype gi = g[i]; | |
Dtype mi = m[i] = m[i] * beta1 + gi * (1 - beta1); | |
Dtype vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2); | |
g[i] = corrected_local_rate * mi / (sqrt(vi) + eps_hat); | |
} | |
} | |
__kernel void TEMPLATE(nesterov_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
Dtype momentum, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype hi = h[i]; | |
Dtype hi_new = h[i] = momentum * hi + local_rate * g[i]; | |
g[i] = (1 + momentum) * hi_new - momentum * hi; | |
} | |
} | |
__kernel void TEMPLATE(rms_prop_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
Dtype rms_decay, | |
Dtype delta, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype gi = g[i]; | |
Dtype hi = h[i] = rms_decay * h[i] + (1 - rms_decay) * gi * gi; | |
g[i] = local_rate * g[i] / (sqrt(hi) + delta); | |
} | |
} | |
__kernel void TEMPLATE(sgd_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
Dtype momentum, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
g[i] = h[i] = momentum * h[i] + local_rate * g[i]; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
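// Tile layer: forward replicates the bottom blob num_tiles times along one
// axis; backward sums the num_tiles corresponding top_diff entries back
// into each bottom element.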
__kernel void TEMPLATE(tile,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, | |
const int_tp tile_size, const int_tp num_tiles, | |
const int_tp bottom_tile_axis, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp d = index % tile_size; | |
const int_tp b = (index / tile_size / num_tiles) % bottom_tile_axis; | |
const int_tp n = index / tile_size / num_tiles / bottom_tile_axis; | |
const int_tp bottom_index = (n * bottom_tile_axis + b) * tile_size + d; | |
top_data[index] = bottom_data[bottom_index]; | |
} | |
} | |
__kernel void TEMPLATE(tile_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top_diff, | |
const int_tp tile_size, | |
const int_tp num_tiles, | |
const int_tp bottom_tile_axis, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp d = index % tile_size; | |
const int_tp b = (index / tile_size) % bottom_tile_axis; | |
const int_tp n = index / tile_size / bottom_tile_axis; | |
bottom_diff[index] = 0; | |
int_tp top_index = (n * num_tiles * bottom_tile_axis + b) * tile_size + d; | |
for (int_tp t = 0; t < num_tiles; ++t) { | |
bottom_diff[index] += top_diff[top_index]; | |
top_index += bottom_tile_axis * tile_size; | |
} | |
} | |
} | |
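// All kernels above were compiled with Dtype = float. When the device
// advertises fp64 (DOUBLE_SUPPORT_AVAILABLE), the same source is emitted
// again below with Dtype = double; TEMPLATE(name,Dtype) keeps the two sets
// of kernel names distinct.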
#ifdef DOUBLE_SUPPORT_AVAILABLE | |
#undef Dtype | |
#define Dtype double | |
#undef TYPE | |
#define TYPE TYPE_DOUBLE | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(relu_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global Dtype* out, | |
Dtype negative_slope) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope; | |
} | |
} | |
__kernel void TEMPLATE(relu_backward,Dtype)(const int_tp n, | |
__global const Dtype* in_diff, | |
__global const Dtype* in_data, | |
__global Dtype* out_diff, | |
Dtype negative_slope) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out_diff[index] = in_diff[index] | |
* ((in_data[index] > 0?1.0:0.0) + (in_data[index] <= 0?1.0:0.0) * negative_slope); | |
} | |
} | |
__kernel void TEMPLATE(tanh_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out[index] = tanh(in[index]); | |
} | |
} | |
__kernel void TEMPLATE(tanh_backward,Dtype)(const int_tp n, | |
__global const Dtype* in_diff, | |
__global const Dtype* out_data, | |
__global Dtype* out_diff) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
Dtype tanhx = out_data[index]; | |
out_diff[index] = in_diff[index] * (1 - tanhx * tanhx); | |
} | |
} | |
__kernel void TEMPLATE(sigmoid_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out[index] = 1.0 / (1.0 + exp(-in[index])); | |
} | |
} | |
__kernel void TEMPLATE(sigmoid_backward,Dtype)(const int_tp n, | |
__global const Dtype* in_diff, | |
__global const Dtype* out_data, | |
__global Dtype* out_diff) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
const Dtype sigmoid_x = out_data[index]; | |
out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x); | |
} | |
} | |
__kernel void TEMPLATE(threshold,Dtype)(const int_tp n, const Dtype threshold, | |
__global const Dtype* in, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out[index] = in[index] > threshold ? 1.0 : 0.0; | |
} | |
} | |
__kernel void TEMPLATE(prelu_forward,Dtype)(const int_tp n, const int_tp channels, | |
const int_tp dim, | |
__global const Dtype* in, | |
__global Dtype* out, | |
__global const Dtype* slope_data, | |
const int_tp div_factor) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
int_tp c = (index / dim) % channels / div_factor; | |
out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c]; | |
} | |
} | |
__kernel void TEMPLATE(prelu_backward,Dtype)(const int_tp n, const int_tp channels, | |
const int_tp dim, | |
__global const Dtype* in_diff, | |
__global const Dtype* in_data, | |
__global Dtype* out_diff, | |
__global const Dtype* slope_data, | |
const int_tp div_factor) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
int_tp c = (index / dim) % channels / div_factor; | |
out_diff[index] = in_diff[index] | |
* ((in_data[index] > 0?1.0:0.0) + (in_data[index] <= 0?1.0:0.0) * slope_data[c]); | |
} | |
} | |
__kernel void TEMPLATE(prelu_param_backward,Dtype)(const int_tp n, const int_tp rows, | |
const int_tp rowPitch, | |
__global const Dtype* in_diff, | |
__global const Dtype* in_data, | |
__global Dtype* out_diff) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0?1.0:0.0); | |
for (int k = 1; k < rows; k++) { | |
out_diff[index] += in_diff[index + k * rowPitch] | |
* in_data[index + k * rowPitch] | |
* (in_data[index + k * rowPitch] <= 0?1.0:0.0); | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(gpu_set,Dtype)(const int_tp n, const Dtype alpha, __global Dtype* y) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index] = alpha; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(br_forward,Dtype)(const int_tp count, const int_tp inner_dim, | |
__global const Dtype* in, | |
__global const Dtype* permut, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
int_tp n = index / (inner_dim); | |
int_tp in_n = (int_tp) (permut[n]); | |
out[index] = in[in_n * (inner_dim) + index % (inner_dim)]; | |
} | |
} | |
__kernel void TEMPLATE(br_backward,Dtype)(const int_tp count, const int_tp inner_dim, | |
__global const Dtype* in, | |
__global const Dtype* top_indexes, | |
__global const Dtype* begins, | |
__global const Dtype* counts, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
int_tp n = index / (inner_dim); | |
out[index] = 0; | |
int_tp lower = (int_tp) (begins[n]); | |
int_tp upper = lower + (int_tp) (counts[n]); | |
for (int_tp i = lower; i < upper; ++i) { | |
int_tp in_n = (int_tp) (top_indexes[i]); | |
out[index] += in[in_n * (inner_dim) + index % (inner_dim)]; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(null_kernel,Dtype)(Dtype arg) {
  // Intentional no-op; presumably a placeholder used to keep the program
  // non-empty or to flush the queue.
  Dtype out = arg;
}
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
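// Bias/scale layers. The coefficient index is (index / inner_dim) % dim:
// each coefficient is broadcast over inner_dim contiguous elements and the
// pattern repeats every bias_dim (or scale_dim) groups.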
__kernel void TEMPLATE(bias_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global const Dtype* bias, | |
const int_tp bias_dim, | |
const int_tp inner_dim, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
const int_tp bias_index = (index / inner_dim) % bias_dim; | |
out[index] = in[index] + bias[bias_index]; | |
} | |
} | |
__kernel void TEMPLATE(scale_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global const Dtype* scale, | |
const int_tp scale_dim, | |
const int_tp inner_dim, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
const int_tp scale_index = (index / inner_dim) % scale_dim; | |
out[index] = in[index] * scale[scale_index]; | |
} | |
} | |
__kernel void TEMPLATE(scale_bias_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global const Dtype* scale, | |
__global const Dtype* bias, | |
const int_tp scale_dim, | |
const int_tp inner_dim, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
const int_tp scale_index = (index / inner_dim) % scale_dim; | |
out[index] = in[index] * scale[scale_index] + bias[scale_index]; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
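// BNLL: computes log(1 + exp(x)) stably via the identity
// log(1 + exp(x)) = x + log(1 + exp(-x)) for x > 0, so exp() never
// overflows; the backward pass likewise clamps its exponent at 50.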
__kernel void TEMPLATE(bnll_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
if (in[index] > 0.0f) { | |
out[index] = in[index] + log((Dtype) (1.0 + exp(-in[index]))); | |
} else { | |
out[index] = log((Dtype) (1.0 + exp(in[index]))); | |
} | |
} | |
} | |
__kernel void TEMPLATE(bnll_backward,Dtype)(const int_tp n, | |
__global const Dtype* in_diff, | |
__global const Dtype* in_data, | |
__global Dtype* out_diff) { | |
Dtype kBNLL_THRESHOLD = 50.; | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
Dtype expval = exp(min(in_data[index], kBNLL_THRESHOLD)); | |
out_diff[index] = in_diff[index] * expval / (expval + 1.); | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
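// Softmax building blocks. A numerically stable channel-axis softmax is
// assembled from these kernels: channel_max, subtract the max, exp,
// channel_sum, divide. kernel_channel_dot (per-position dot product across
// channels) is used by the backward pass.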
__kernel void TEMPLATE(kernel_channel_max,Dtype)(const int_tp num, const int_tp channels, | |
const int_tp spatial_dim, | |
__global const Dtype* data, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < num * spatial_dim; index += | |
get_global_size(0)) { | |
int_tp n = index / spatial_dim; | |
int_tp s = index % spatial_dim; | |
    Dtype maxval = -FLT_MAX;
    for (int_tp c = 0; c < channels; ++c) {
      maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
} | |
out[index] = maxval; | |
} | |
} | |
__kernel void TEMPLATE(kernel_channel_subtract,Dtype)(const int_tp count, const int_tp num, | |
const int_tp channels, | |
const int_tp spatial_dim, | |
__global const Dtype* channel_max, | |
__global Dtype* data) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
int_tp n = index / channels / spatial_dim; | |
int_tp s = index % spatial_dim; | |
data[index] -= channel_max[n * spatial_dim + s]; | |
} | |
} | |
__kernel void TEMPLATE(kernel_exp,Dtype)(const int_tp count, __global const Dtype* data, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
out[index] = exp(data[index]); | |
} | |
} | |
__kernel void TEMPLATE(kernel_channel_sum,Dtype)(const int_tp num, const int_tp channels, | |
const int_tp spatial_dim, | |
__global const Dtype* data, | |
__global Dtype* channel_sum) { | |
for (int_tp index = get_global_id(0); index < num * spatial_dim; index += | |
get_global_size(0)) { | |
int_tp n = index / spatial_dim; | |
int_tp s = index % spatial_dim; | |
Dtype sum = 0; | |
for (int_tp c = 0; c < channels; ++c) { | |
sum += data[(n * channels + c) * spatial_dim + s]; | |
} | |
channel_sum[index] = sum; | |
} | |
} | |
__kernel void TEMPLATE(kernel_channel_div,Dtype)(const int_tp count, const int_tp num, | |
const int_tp channels, const int_tp spatial_dim, | |
__global const Dtype* channel_sum, | |
__global Dtype* data) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
int_tp n = index / channels / spatial_dim; | |
int_tp s = index % spatial_dim; | |
data[index] /= channel_sum[n * spatial_dim + s]; | |
} | |
} | |
__kernel void TEMPLATE(kernel_channel_dot,Dtype)(const int_tp num, const int_tp channels, | |
const int_tp spatial_dim, | |
__global const Dtype* data_1, | |
__global const Dtype* data_2, | |
__global Dtype* channel_dot) { | |
for (int_tp index = get_global_id(0); index < num * spatial_dim; index += | |
get_global_size(0)) { | |
int_tp n = index / spatial_dim; | |
int_tp s = index % spatial_dim; | |
Dtype dot = 0; | |
for (int_tp c = 0; c < channels; ++c) { | |
dot += (data_1[(n * channels + c) * spatial_dim + s] | |
* data_2[(n * channels + c) * spatial_dim + s]); | |
} | |
channel_dot[index] = dot; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(concat,Dtype)(const int_tp nthreads, __global const Dtype* in_data, | |
const int forward, const int_tp num_concats, | |
const int_tp concat_size, | |
const int_tp top_concat_axis, | |
const int_tp bottom_concat_axis, | |
const int_tp offset_concat_axis, | |
__global Dtype* out_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp total_concat_size = concat_size * bottom_concat_axis; | |
const int_tp concat_num = index / total_concat_size; | |
const int_tp concat_index = index % total_concat_size; | |
const int_tp top_index = concat_index | |
+ (concat_num * top_concat_axis + offset_concat_axis) * concat_size; | |
if (forward == 1) { | |
out_data[top_index] = in_data[index]; | |
} else { | |
out_data[index] = in_data[top_index]; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
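// Contrastive loss, backward pass: similar pairs (y != 0) get alpha * diff;
// dissimilar pairs get -alpha * (margin - d) / d * diff while still inside
// the margin, else 0. The legacy variant applies the margin to the squared
// distance and uses a constant -alpha instead.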
__kernel void TEMPLATE(cll_backward,Dtype)(const int_tp count, const int_tp channels, | |
const Dtype margin, const Dtype alpha, __global const Dtype* y, | |
__global const Dtype* diff, __global const Dtype* dist_sq, | |
__global Dtype *bottom_diff) { | |
for (int_tp i = get_global_id(0); i < count; | |
i += get_global_size(0)) { | |
int_tp n = i / channels; // the num index, to access y and dist_sq | |
if (trunc(y[n]) != 0.) { // similar pairs | |
bottom_diff[i] = alpha * diff[i]; | |
} else { // dissimilar pairs | |
Dtype mdist = 0.; | |
Dtype beta = 0.; | |
Dtype dist = sqrt(dist_sq[n]); | |
mdist = (margin - dist); | |
beta = -alpha * mdist / (dist + 1e-4) * diff[i]; | |
if (mdist > 0.) { | |
bottom_diff[i] = beta; | |
} else { | |
bottom_diff[i] = 0; | |
} | |
} | |
} | |
} | |
__kernel void TEMPLATE(cll_backward_legacy,Dtype)(const int count, const int channels, | |
const Dtype margin, const Dtype alpha, __global Dtype* y, | |
__global Dtype* diff, __global Dtype* dist_sq, | |
__global Dtype* bottom_diff) { | |
for (int_tp i = get_global_id(0); i < count; | |
i += get_global_size(0)) { | |
int n = i / channels; // the num index, to access y and dist_sq | |
if (trunc(y[n]) != 0.) { // similar pairs | |
bottom_diff[i] = alpha * diff[i]; | |
} else { // dissimilar pairs | |
Dtype mdist = 0.; | |
Dtype beta = 0.; | |
mdist = (margin - dist_sq[n]); | |
beta = -alpha; | |
if (mdist > 0.) { | |
bottom_diff[i] = beta; | |
} else { | |
bottom_diff[i] = 0; | |
} | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(conv_layer_spatial_phony,Dtype)(Dtype arg) { | |
Dtype out = arg; | |
} | |
#define __CAT(x, y) x##y | |
#define CAT(x, y) __CAT(x, y) | |
#define LOOP1(VAR, STMT) (STMT); (VAR)++; | |
#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT)) | |
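// LOOP(N, VAR, STMT) token-pastes to LOOPN, which expands STMT exactly N
// times and increments VAR after each expansion: a compile-time unroll for
// literal N in 1..16.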
#ifdef MULTI | |
__kernel void CFMulti(__global Dtype* image_data, int_tp image_offset, | |
__global Dtype* kernel_data, int_tp kernel_offset, | |
__global Dtype* bias,const int_tp bias_offset, | |
__global Dtype* convolved_image,const int_tp convolved_image_offset, | |
const ushort WIDTH, | |
const ushort HEIGHT, | |
const ushort OUTPUT_W, | |
const ushort OUTPUT_H) { | |
const int_tp outputX = get_global_id(0); | |
const int_tp outputY = get_global_id(1); | |
const int_tp kernelNum = get_global_id(2)*ZPAR; | |
if(outputX < OUTPUT_W && outputY < OUTPUT_H) | |
{ | |
Dtype sum[ZPAR]; | |
Dtype4 vectorSum[ZPAR]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
{ | |
sum[kern] = 0.0f; | |
      vectorSum[kern] = (Dtype4)(0.0f, 0.0f, 0.0f, 0.0f);  // needs the (Dtype4) cast: a bare parenthesized list is a comma expression
} | |
const int_tp currentKernelOffset = kernel_offset + kernelNum*KERNEL_H*KERNEL_W*CHANNELS; | |
const int_tp biasIndex=bias_offset + kernelNum; | |
const int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W; | |
const int_tp imageSize = WIDTH*HEIGHT; | |
const int_tp float4Reads = KERNEL_W / 4; | |
const int_tp floatReads = KERNEL_W % 4; | |
Dtype4 imageCache; | |
__global Dtype* image_dataPtrFloat = (image_data + (image_offset + local_image_offset)); | |
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset)); | |
for(int_tp c = 0; c < CHANNELS; c++) | |
{ | |
for(int_tp y = 0; y < KERNEL_H; y++) | |
{ | |
for(int_tp x=0; x< float4Reads; x++) | |
{ | |
imageCache = ((__global Dtype4*)image_dataPtrFloat)[x]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
{ | |
vectorSum[kern] += imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[x]; | |
} | |
} | |
if(floatReads == 1) | |
{ | |
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
vectorSum[kern].s0 += ( imageCache * ( (__global Dtype4*) &(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]) )[float4Reads] ).s0; | |
} | |
else if(floatReads == 2) | |
{ | |
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
vectorSum[kern].s01 += (imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[float4Reads]).s01; | |
} | |
else if(floatReads == 3) | |
{ | |
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
vectorSum[kern].s012 += (imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[float4Reads]).s012; | |
} | |
image_dataPtrFloat += WIDTH; | |
kernel_dataPtrFloat += KERNEL_W; | |
} | |
image_dataPtrFloat += imageSize - WIDTH*KERNEL_H; | |
} | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
sum[kern] = vectorSum[kern].x + vectorSum[kern].y + vectorSum[kern].z + vectorSum[kern].w; | |
if(APPLY_BIAS == 1) | |
{ | |
for(int_tp kern = 0; kern < ZPAR; kern++) | |
if(kernelNum+kern < OUTPUT_Z) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX] = | |
sum[kern] + bias[biasIndex +kern]; | |
} | |
else | |
for(int_tp kern = 0; kern < ZPAR; kern++) | |
if(kernelNum+kern < OUTPUT_Z) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX] = sum[kern]; | |
} | |
} | |
#endif | |
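// The _11_11_4 specialization below looks tuned for 11x11 filters with
// stride 4 (an AlexNet-style first layer): each work-item accumulates four
// adjacent x-outputs (XPAR) from a 24-wide image strip, consuming each
// 11-tap filter row as 8 + 3 lanes.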
#ifdef MULTI_11 | |
__kernel void CFMulti_11_11_4(__global Dtype* image_data, int_tp image_offset, | |
__global Dtype* kernel_data, int_tp kernel_offset, | |
__global Dtype* bias,const int_tp bias_offset, | |
__global Dtype* convolved_image,const int_tp convolved_image_offset, | |
const ushort WIDTH, | |
const ushort HEIGHT, | |
const ushort OUTPUT_W, | |
const ushort OUTPUT_H) { | |
int_tp outputX = get_global_id(0)*XPAR; | |
int_tp outputY = get_global_id(1)*YPAR; | |
int_tp kernelNum = get_global_id(2)*ZPAR; | |
if(outputX < OUTPUT_W && outputY < OUTPUT_H) | |
{ | |
Dtype sum[XPAR*YPAR*ZPAR]; | |
for(int_tp kern =0; kern < XPAR*YPAR*ZPAR; kern++) | |
{ | |
sum[kern] = 0.0f; | |
} | |
int_tp currentKernelOffset = kernel_offset + kernelNum*KERNELSIZE*CHANNELS; | |
int_tp biasIndex=bias_offset + kernelNum; | |
int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W; | |
int_tp imageSize = WIDTH*HEIGHT; | |
int_tp index; | |
__global Dtype* image_dataPtrFloat = (image_data + (image_offset + local_image_offset)); | |
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset)); | |
Dtype16 imageCache; | |
Dtype8 imageCacheR; | |
Dtype8 kernelCache; | |
Dtype4 kernelCacheR; | |
for(int_tp c = 0; c < CHANNELS; c++) | |
{ | |
for(int_tp y = 0; y < 11; y++) | |
{ | |
imageCache = ((__global Dtype16*)image_dataPtrFloat)[0]; | |
imageCacheR =((__global Dtype8*)image_dataPtrFloat)[2]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
{ | |
kernelCache = ((__global Dtype8*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[0]; | |
kernelCacheR = ((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[2]; | |
index = kern*XPAR; | |
sum[index + 0] += dot(imageCache.S0123,kernelCache.S0123); | |
sum[index + 1] += dot(imageCache.S4567,kernelCache.S0123); | |
sum[index + 2] += dot(imageCache.S89AB,kernelCache.S0123); | |
sum[index + 3] += dot(imageCache.SCDEF,kernelCache.S0123); | |
sum[index + 0] += dot(imageCache.S4567,kernelCache.S4567); | |
sum[index + 1] += dot(imageCache.S89AB,kernelCache.S4567); | |
sum[index + 2] += dot(imageCache.SCDEF,kernelCache.S4567); | |
sum[index + 3] += dot(imageCacheR.S0123,kernelCache.S4567); | |
sum[index + 0] += dot(imageCache.S89A,kernelCacheR.S012); | |
sum[index + 1] += dot(imageCache.SCDE,kernelCacheR.S012); | |
sum[index + 2] += dot(imageCacheR.S012,kernelCacheR.S012); | |
sum[index + 3] += dot(imageCacheR.S456,kernelCacheR.S012); | |
} | |
image_dataPtrFloat += WIDTH; | |
kernel_dataPtrFloat += KERNEL_W; | |
} | |
image_dataPtrFloat += imageSize - WIDTH*KERNEL_H; | |
} | |
if(APPLY_BIAS == 1) | |
{ | |
for(int_tp kern = 0; kern < ZPAR; kern++) | |
{ | |
for(int_tp wi =0; wi < XPAR; wi++) | |
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX + wi] = | |
sum[kern*XPAR + wi] + bias[biasIndex +kern]; | |
} | |
} | |
else | |
for(int_tp kern = 0; kern < ZPAR; kern++) | |
for(int_tp wi =0; wi < XPAR; wi++) | |
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX + wi] = sum[kern*XPAR + wi]; | |
} | |
} | |
#endif | |
#ifdef MULTI_GEN | |
__kernel void CFMulti_6(__global const Dtype* restrict image_data, const int_tp image_offset, | |
__global const Dtype* restrict kernel_data, const int_tp kernel_offset, | |
__global const Dtype* restrict bias,const int_tp bias_offset, | |
__global Dtype* restrict convolved_image,const int_tp convolved_image_offset, | |
const ushort WIDTH, | |
const ushort HEIGHT, | |
const ushort OUTPUT_W, | |
const ushort OUTPUT_H) { | |
const int_tp outputX = get_global_id(0)*XPAR; | |
const int_tp outputY = get_global_id(1)*YPAR; | |
const int_tp kernelNum = get_global_id(2)*ZPAR; | |
if(outputX < OUTPUT_W && outputY < OUTPUT_H) | |
{ | |
Dtype sum[XPAR*YPAR*ZPAR]; | |
for(uint_tp kern = 0; kern < XPAR*YPAR*ZPAR; kern++) | |
sum[kern] = 0.0f; | |
const int_tp currentKernelOffset = kernel_offset + kernelNum*KERNELSIZE*CHANNELS; | |
const int_tp biasIndex=bias_offset + kernelNum; | |
const int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W; | |
const int_tp imageSize = WIDTH*HEIGHT; | |
int_tp index; | |
__global const Dtype* image_dataPtrFloat[2]; | |
image_dataPtrFloat[0] = (image_data + (image_offset + local_image_offset)); | |
image_dataPtrFloat[1] = image_dataPtrFloat[0]; | |
__global const Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset)); | |
DTImage imageCache[YPAR]; | |
DTKernel kernelCache; | |
Dtype4 temp; | |
for(uint_tp c = 0; c < CHANNELS; c++) | |
{ | |
imageCache[0] = ((__global DTImage*)image_dataPtrFloat[1])[0]; | |
for(uint_tp preload = 1; preload < YPAR; preload++) | |
{ | |
image_dataPtrFloat[1] += WIDTH; | |
imageCache[preload] = ((__global DTImage*)image_dataPtrFloat[1])[0]; | |
} | |
int_tp y =0; | |
LOOP(KERNEL_H, y, | |
{ | |
int_tp kern=0; | |
LOOP(ZPAR, kern, | |
{ | |
kernelCache = ((__global DTKernel*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[0]; | |
index = kern*XPAR*YPAR; | |
for(uint_tp y_par = 0; y_par < YPAR; y_par++) | |
{ | |
temp = floatDotV4(imageCache[y_par],kernelCache); | |
sum[index + y_par*XPAR + 0] += temp.s0; | |
sum[index + y_par*XPAR + 1] += temp.s1; | |
sum[index + y_par*XPAR + 2] += temp.s2; | |
sum[index + y_par*XPAR + 3] += temp.s3; | |
} | |
}); | |
kernel_dataPtrFloat += KERNEL_W; | |
for(uint_tp rotateData = 0; rotateData < YPAR - 1; rotateData++) | |
imageCache[rotateData] = imageCache[rotateData + 1]; | |
image_dataPtrFloat[1] += WIDTH; | |
imageCache[YPAR - 1] = ((__global DTImage*)image_dataPtrFloat[1])[0]; | |
}); | |
image_dataPtrFloat[0] += imageSize; | |
image_dataPtrFloat[1] = image_dataPtrFloat[0]; | |
} | |
if(APPLY_BIAS == 1) | |
{ | |
for(uint_tp kern = 0; kern < ZPAR; kern++) | |
{ | |
for(uint_tp hi =0; hi < YPAR; hi++) | |
for(uint_tp wi =0; wi < XPAR; wi++) | |
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W && outputY + hi < OUTPUT_H) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + (outputY +hi)*OUTPUT_W + outputX + wi] = | |
sum[kern*XPAR*YPAR + XPAR*hi + wi] + bias[biasIndex +kern]; | |
} | |
} | |
else | |
for(uint_tp kern = 0; kern < ZPAR; kern++) | |
for(uint_tp hi =0; hi < YPAR; hi++) | |
for(uint_tp wi =0; wi < XPAR; wi++) | |
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W && outputY + hi < OUTPUT_H) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + (outputY + hi)*OUTPUT_W + outputX + wi] = sum[kern*XPAR*YPAR +XPAR*hi +wi]; | |
} | |
} | |
#endif | |
// Begin IDLF kernels below here.
#ifdef IDLF | |
#define activation_function(x) (x) | |
#if 0 | |
#define _IW INPUT_WIDTH | |
#define _IH INPUT_HEIGHT | |
#define _OW OUTPUT_WIDTH | |
#define _OH OUTPUT_HEIGHT | |
#endif | |
#define _ID INPUT_DEPTH | |
#define _OD NUM_FILTERS | |
#define FILTER_DEPTH INPUT_DEPTH | |
#define NUM_INPUT INPUT_DEPTH | |
#define NUM_OUTPUT NUM_FILTERS | |
#define KERNEL FILTER_WIDTH | |
// convolution stride, same for x and y | |
#define K_STRIDE STRIDEX | |
#ifndef IWPAD | |
#define IWPAD 0 | |
#endif | |
#ifndef IHPAD | |
#define IHPAD 0 | |
#endif | |
#define OUT_BLOCK_SIZE (OUT_BLOCK_WIDTH*OUT_BLOCK_HEIGHT) | |
#ifndef MASTER_OUT_BLOCK_WIDTH | |
#define MASTER_OUT_BLOCK_WIDTH OUT_BLOCK_WIDTH | |
#endif | |
#ifndef MASTER_OUT_BLOCK_HEIGHT | |
#define MASTER_OUT_BLOCK_HEIGHT OUT_BLOCK_HEIGHT | |
#endif | |
// Each work-item computes a 4x6 region of one output map. | |
// Each work-group (which will be mapped to 1 SIMD16 EU thread) will compute 16 different feature maps, but each feature map is for the same 4x6 region of the input image.
// NDRange: (_OW+pad)/ OUT_BLOCK_WIDTH, (_OH+pad)/OUT_BLOCK_HEIGHT, _OD/OUT_BLOCK_DEPTH | |
//#define SIMD_SIZE 16 | |
// NOTE: this reqd_work_group_size does not guarantee that SIMD16 mode will be used, the compiler could choose to use two SIMD8 threads, and if that happens the code will break. | |
#ifdef SIMD16 | |
#define TILE_X ((OUT_BLOCK_WIDTH - 1) * STRIDEX + KERNEL) | |
#define TILE_Y ((OUT_BLOCK_HEIGHT - 1) * STRIDEY + KERNEL) | |
#if (TILE_X % 4) != 0 | |
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE))) | |
kernel void | |
convolve_simd16( // __global float *inputs, __global float* weights, __global float* outputs | |
__global float* inputs_base, | |
filter_qualifier float* weights_base, | |
__global float* biases_base, | |
__global float* outputs_base, | |
const ushort _IW, | |
const ushort _IH, | |
const ushort _OW, | |
const ushort _OH) | |
{ | |
__global float* outputs = outputs_base; | |
__global float* inputs = inputs_base; | |
filter_qualifier float* weights = weights_base; | |
__global float* biases = biases_base; | |
uint_tp oc = get_global_id(0) * MASTER_OUT_BLOCK_WIDTH; // oc = Output Column | |
uint_tp or = get_global_id(1) * MASTER_OUT_BLOCK_HEIGHT;// or = Output Row | |
uint_tp fm = get_global_id(2);// fm = Feature Map = od = Output Depth | |
uint_tp fmg = get_group_id(2); | |
uint_tp lid = get_local_id(2); | |
float in[IN_BUFFER_SIZE];// load 11x16 block of input data, really only need 11x15 for 4x6 outputs, but keep it simple. | |
//float out[24]; // 4x6 block of outputs that is SIMD_SIZE deep (along the Feature Map dimension). | |
float out[OUT_BLOCK_SIZE]; | |
uint_tp in_addr; | |
// find the weight address of the given neuron (lid is the index)
uint_tp weight_addr = (fmg % (_OD/SIMD_SIZE)) * INPUT_DEPTH * KERNEL * KERNEL * SIMD_SIZE + lid; | |
for(int_tp i=0;i<OUT_BLOCK_SIZE;i++) { | |
out[i]=0.0f; | |
} | |
uint_tp num_in_batch = fm / _OD; | |
uint_tp input_batch_offset = num_in_batch * (_IH + IHPAD) * (_IW + IWPAD) * TOTAL_INPUT_DEPTH_SIZE; | |
for(int_tp kd = 0; kd < _ID; kd++) | |
{ | |
in_addr = input_batch_offset + (kd + INPUT_START_Z) * (_IH + IHPAD) * (_IW + IWPAD) + (or*K_STRIDE + INPUT_START_Y) * (_IW + IWPAD) + (oc*K_STRIDE + INPUT_START_X) + lid; | |
// read 11x16 input block into registers. | |
for(uint_tp reg = 0; reg < IN_BUFFER_SIZE; reg++) { | |
in[reg] = inputs[in_addr]; // each work-item reads one element (16 per row across the sub-group)
in_addr += (_IW + IWPAD);// move to next row down | |
} | |
// WEIGHT_PREF can only be 4 or 8; other values are not supported.
#define WEIGHT_PREF 8 | |
union { | |
float w[WEIGHT_PREF]; | |
uint8 ui8; | |
} weight_buf; | |
int_tp w_idx=0; | |
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]); | |
uint_tp orig_weight_addr = weight_addr; | |
weight_addr += SIMD_SIZE * WEIGHT_PREF; | |
int_tp kr = 0; // kr = Kernel Row | |
LOOP(KERNEL, kr,// LOOP is a macro that unrolls the loop. | |
{ | |
int_tp kc = 0; // kc = Kernel Column | |
LOOP(KERNEL, kc, | |
{ | |
for(int_tp br=0; br < OUT_BLOCK_HEIGHT; br++) { | |
for(int_tp bc=0; bc < OUT_BLOCK_WIDTH; bc++) { | |
float input = intel_sub_group_shuffle( in[br * K_STRIDE + kr], bc * K_STRIDE + kc); | |
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]); | |
} | |
} | |
// We assume KERNEL_W is equal to KERNEL_H here. | |
if ((w_idx + 1) % WEIGHT_PREF == 0 | |
#if KERNEL*KERNEL % 8 != 0 | |
&& ((w_idx + 1) <= (KERNEL * KERNEL - WEIGHT_PREF)) | |
#endif | |
) { | |
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]); | |
weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details. | |
} | |
#if KERNEL*KERNEL % 8 == 0 | |
// no remainder handling needed when KERNEL*KERNEL is a multiple of 8
#else | |
else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL * KERNEL - WEIGHT_PREF))) | |
#if KERNEL*KERNEL % 8 == 1 | |
weight_buf.w[0] = weights[weight_addr]; | |
#elif KERNEL*KERNEL % 4 == 0 | |
weight_buf.ui8.s0123 = intel_sub_group_block_read4((__global uint *)&weights[weight_addr]); | |
#else | |
// should never get here if kernel_w equals kernel_h; just in case.
#error unsupported kernel size. | |
#endif | |
#endif | |
++w_idx; | |
}); | |
}); | |
weight_addr = orig_weight_addr + KERNEL * KERNEL * SIMD_SIZE; | |
} | |
#ifdef IMAGE_AS_OUTPUT | |
// TODO: no ULT for that one yet! | |
uint_tp out_addr = ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD)) * (_OW + OWPAD) * (_OH + OHPAD);// out_addr indexes into start of 16 feature maps. | |
#else | |
// we need this address calculation for outputs because we support views and batching | |
uint_tp out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD) ) * (_OW + OWPAD) * (_OH + OHPAD); | |
#endif | |
out_addr += or * (_OW + OWPAD) + oc; // offset of the output block this work-item computes
// we need this address calculation for biases because we support views and batching | |
float bias = biases[(fm) % _OD ]; | |
#ifndef WRITE_PADDED_VALUES | |
if(get_global_id(0) != (get_global_size(0)-1) && | |
get_global_id(1) != (get_global_size(1)-1) ) | |
{ | |
#endif | |
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) { | |
// this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer. | |
#ifdef IMAGE_AS_OUTPUT | |
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c])); | |
#else | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
#endif | |
} | |
} | |
#ifndef WRITE_PADDED_VALUES | |
} else if ( get_global_id(1) != (get_global_size(1)-1) ) | |
{ | |
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) { | |
#ifdef IMAGE_AS_OUTPUT | |
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c])); | |
#else | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
#endif | |
} | |
} | |
} | |
else if ( get_global_id(0) != (get_global_size(0)-1) ) | |
{ | |
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) { | |
#ifdef IMAGE_AS_OUTPUT | |
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c])); | |
#else | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
#endif | |
} | |
} | |
} | |
else | |
{ | |
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) { | |
#ifdef IMAGE_AS_OUTPUT | |
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]));
#else | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
#endif | |
} | |
} | |
} | |
#endif //#ifndef WRITE_PADDED_VALUES | |
} | |
#endif | |
#if TILE_X % 4 == 0 | |
#define TILE_Y_STRIDE (64 / TILE_X) | |
#define INVEC_NUM ((TILE_Y + TILE_Y_STRIDE - 1) / TILE_Y_STRIDE) | |
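// Note: one float4 load per lane fetches 16 lanes * 4 floats = 64 input values
// across the sub-group, i.e. TILE_Y_STRIDE = 64 / TILE_X tile rows per load;
// INVEC_NUM is the number of such loads needed to cover all TILE_Y rows.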
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE))) | |
kernel void | |
convolve_simd16( // __global float *inputs, __global float* weights, __global float* outputs | |
__global float* inputs_base, | |
filter_qualifier float* weights_base, | |
__global float* biases_base, | |
__global float* outputs_base, | |
const ushort _IW, | |
const ushort _IH, | |
const ushort _OW, | |
const ushort _OH) | |
{ | |
__global float* outputs = outputs_base; | |
__global float* inputs = inputs_base; | |
filter_qualifier float* weights = weights_base; | |
__global float* biases = biases_base; | |
uint_tp oc = get_global_id(0) * MASTER_OUT_BLOCK_WIDTH; // oc = Output Column | |
uint_tp or = get_global_id(1) * MASTER_OUT_BLOCK_HEIGHT;// or = Output Row | |
uint_tp fm = get_global_id(2);// fm = Feature Map = od = Output Depth | |
uint_tp fmg = get_group_id(2); | |
uint_tp lid = get_local_id(2); | |
float out[OUT_BLOCK_SIZE]; | |
uint_tp in_addr; | |
// find the weight address of the given neuron (lid is the index)
uint_tp weight_addr = (fmg % (_OD/SIMD_SIZE)) * INPUT_DEPTH * KERNEL * KERNEL * SIMD_SIZE + lid; | |
for(int_tp i=0;i<OUT_BLOCK_SIZE;i++) { | |
out[i]=0.0f; | |
} | |
uint_tp num_in_batch = ( fm ) / _OD; | |
uint_tp input_batch_offset = num_in_batch * (_IH + IHPAD) * (_IW + IWPAD) * TOTAL_INPUT_DEPTH_SIZE; | |
in_addr = input_batch_offset + INPUT_START_Z * (_IH + IHPAD) * (_IW + IWPAD) + (or*STRIDEY + INPUT_START_Y) * (_IW + IWPAD) + (oc*STRIDEX + INPUT_START_X) | |
+ ( lid / ( TILE_X / 4 ) ) * (_IW + IWPAD) * STRIDEY // y tile offset | |
+ ( lid % ( TILE_X / 4 ) ) * 4 * STRIDEX; // x tile offset | |
for(int_tp kd = 0; kd < _ID; kd++) | |
{ | |
union { | |
float4 in_vec[INVEC_NUM]; | |
float in_array[INVEC_NUM * 4]; | |
} in_buf; | |
uint_tp in_offset = in_addr; | |
int_tp reg = 0; | |
#if INVEC_NUM == 1 | |
LOOP(1, reg, | |
#elif INVEC_NUM == 2 | |
LOOP(2, reg, | |
#elif INVEC_NUM == 3 | |
LOOP(3, reg, | |
#elif INVEC_NUM == 4 | |
LOOP(4, reg, | |
#else | |
#error INVEC_NUM too large.
#endif | |
{ | |
in_buf.in_vec[reg] = *(global float4*)(inputs + in_offset); // each work-item reads a float4 (64 elements across the sub-group)
in_offset += (_IW + IWPAD) * TILE_Y_STRIDE; | |
}); | |
in_addr += (_IH + IHPAD) * (_IW + IWPAD); | |
// WEIGHT_PREF can only be 4 or 8; other values are not supported.
#define WEIGHT_PREF 8 | |
union { | |
float w[WEIGHT_PREF]; | |
uint8 ui8; | |
} weight_buf; | |
int_tp w_idx=0; | |
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]); | |
uint_tp orig_weight_addr = weight_addr; | |
weight_addr += SIMD_SIZE * WEIGHT_PREF; | |
#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4)) | |
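// Decode sketch for BLOCK_IN: the TILE_X-wide, row-major input tile is stored
// as float4s spread across the 16 lanes, so element n = row * TILE_X + col
// lives in register n / 64, lane (n % 64) / 4, float slot n % 4. E.g. with
// TILE_X = 16, n = 70 (row 4, col 6) resolves to register 1, lane 1, slot 2;
// BLOCK_IN(n) broadcasts that float from its owning lane to the whole sub-group.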
int_tp kr = 0; // kr = Kernel Row | |
LOOP(KERNEL, kr,// LOOP is a macro that unrolls the loop. | |
{ | |
int_tp kc = 0; // kc = Kernel Column | |
LOOP(KERNEL, kc, | |
{ | |
for(int_tp br=0; br < OUT_BLOCK_HEIGHT; br++) { | |
for(int_tp bc=0; bc < OUT_BLOCK_WIDTH; bc++) { | |
float input = BLOCK_IN((br * STRIDEY + kr) * TILE_X + bc * STRIDEX + kc);//intel_sub_group_shuffle( in[br * K_STRIDE + kr], bc * K_STRIDE + kc); | |
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]); | |
} | |
} | |
// We assume KERNEL_W is equal to KERNEL_H here. | |
if ((w_idx + 1) % WEIGHT_PREF == 0 | |
#if KERNEL*KERNEL % 8 != 0 | |
&& ((w_idx + 1) <= (KERNEL * KERNEL - WEIGHT_PREF)) | |
#endif | |
) { | |
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]); | |
weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details. | |
} | |
#if KERNEL*KERNEL % 8 == 0 | |
// no remainder handling needed when KERNEL*KERNEL is a multiple of 8
#else | |
else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL * KERNEL - WEIGHT_PREF))) | |
#if KERNEL*KERNEL % 8 == 1 | |
weight_buf.w[0] = weights[weight_addr]; | |
#elif KERNEL*KERNEL % 4 == 0 | |
weight_buf.ui8.s0123 = intel_sub_group_block_read4((__global uint *)&weights[weight_addr]); | |
#else | |
// should never get here if kernel_w equals kernel_h; just in case.
#error unsupported kernel size. | |
#endif | |
#endif | |
++w_idx; | |
}); | |
}); | |
weight_addr = orig_weight_addr + KERNEL * KERNEL * SIMD_SIZE; | |
} | |
// we need this address calculation for outputs because we support views and batching | |
uint_tp out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD) ) * (_OW + OWPAD) * (_OH + OHPAD); | |
out_addr += or * (_OW + OWPAD) + oc; // offset of the output block this work-item computes
// we need this address calculation for biases because we support views and batching | |
float bias = biases[(fm) % _OD ]; | |
#ifndef WRITE_PADDED_VALUES | |
if(get_global_id(0) != (get_global_size(0)-1) && | |
get_global_id(1) != (get_global_size(1)-1) ) | |
{ | |
#endif | |
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) { | |
// this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer. | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
} | |
} | |
#ifndef WRITE_PADDED_VALUES | |
} else if ( get_global_id(1) != (get_global_size(1)-1) ) | |
{ | |
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) { | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
} | |
} | |
} | |
else if ( get_global_id(0) != (get_global_size(0)-1) ) | |
{ | |
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) { | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
} | |
} | |
} | |
else | |
{ | |
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) { | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
} | |
} | |
} | |
#endif //#ifndef WRITE_PADDED_VALUES | |
} | |
#endif // TILE_X % 4 == 0
#endif | |
#endif | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(copyImage, Dtype) | |
(__global Dtype* image_data, | |
int_tp image_offset, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp adjustedHeight, const int_tp adjustedWidth, | |
const int_tp pad_h, const int_tp pad_w, | |
__global Dtype* output_image, | |
const int_tp output_offset, | |
const int_tp batch_size) { | |
uint_tp sX = get_global_id(0); | |
uint_tp sY = get_global_id(1); | |
uint_tp sZ = get_global_id(2); | |
int_tp in_y = sY - pad_h; | |
int_tp in_x = sX - pad_w; | |
int_tp batch_offset = 0; | |
int_tp adjusted_batch_offset = 0; | |
for(uint_tp batch_idx = 0; batch_idx < batch_size; batch_idx++) { | |
int_tp dst_offset = adjusted_batch_offset + output_offset + sZ*adjustedHeight*adjustedWidth + sY*adjustedWidth +sX; | |
int_tp src_offset = batch_offset + image_offset + sZ*height*width + in_y*width + in_x; | |
if((in_y >= 0 && in_y < height && in_x >= 0 && in_x < width)) | |
output_image[dst_offset] = image_data[src_offset]; | |
else | |
output_image[dst_offset] = 0; | |
batch_offset += height * width * channels; | |
adjusted_batch_offset += adjustedHeight * adjustedWidth * channels; | |
} | |
} | |
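// Note: copyImage copies each channel into a zero-padded adjustedHeight x
// adjustedWidth destination (borders of pad_h/pad_w become 0), once per image
// in the batch.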
__kernel void TEMPLATE(copyWeightsSwizzled, Dtype) | |
(__global Dtype* weightIn, | |
__global Dtype* weightOut, | |
const int_tp kernel_w, | |
const int_tp kernel_h, | |
const int_tp channels, | |
const int_tp outputs, | |
const int_tp swizzleFactor) { | |
uint_tp sX = get_global_id(0); | |
// Decompose the flat id into the original (OIHW) location and the swizzled output location below.
int_tp outputSublayer = channels / swizzleFactor; | |
int_tp outputSublayerIndex = channels % swizzleFactor; | |
int_tp filter = sX / (kernel_w*kernel_h*channels); | |
int_tp kernel_X = sX % kernel_w; | |
int_tp kernel_Y = (sX / kernel_w) % kernel_h; | |
int_tp kernel_C = (sX / (kernel_w * kernel_h)) % channels; | |
int_tp FP = filter / swizzleFactor; | |
int_tp F1 = filter % swizzleFactor; | |
weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1] | |
= weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X]; | |
} | |
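/* Layout sketch for copyWeightsSwizzled: OIHW weights are regrouped so that
 * swizzleFactor (S) consecutive output filters form the innermost dimension:
 *   out[filter / S][c][y][x][filter % S] = in[filter][c][y][x]
 * e.g. with S = 16, filter 35 lands in group FP = 2 at slot F1 = 3, so one
 * SIMD16 sub-group block read fetches the same (c, y, x) weight for 16 filters. */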
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(crop_copy, Dtype)(const int_tp n, const int_tp height, | |
const int_tp width, | |
const int_tp src_outer_stride, | |
const int_tp src_inner_stride, | |
const int_tp dest_outer_stride, | |
const int_tp dest_inner_stride, | |
__global const Dtype* src, | |
const int_tp src_off, | |
__global Dtype* dest, | |
const int_tp dest_off) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
int_tp src_start = index / height * src_outer_stride | |
+ index % height * src_inner_stride; | |
int_tp dest_start = index / height * dest_outer_stride | |
+ index % height * dest_inner_stride; | |
for (int_tp i = 0; i < width; ++i) { | |
dest[dest_off + dest_start + i] = src[src_off + src_start + i]; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(dropout_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global const uint_tp* mask, | |
const uint_tp threshold, | |
const Dtype scale, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out[index] = in[index] * ((mask[index] > threshold)?1.0:0.0) * scale; | |
} | |
} | |
__kernel void TEMPLATE(dropout_backward,Dtype)( | |
const int_tp n, __global const Dtype* in_diff, | |
__global const uint_tp* mask, const uint_tp threshold, | |
const Dtype scale, | |
__global Dtype* out_diff) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out_diff[index] = in_diff[index] * ((mask[index] > threshold)?1.0:0.0) * scale; | |
} | |
} | |
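/* Note: mask[] holds uniform random unsigned integers and an activation is kept
 * when mask[index] > threshold. With drop ratio r the host presumably sets
 * threshold ~= r * UINT_MAX and scale = 1 / (1 - r) (inverted dropout), so no
 * extra scaling is needed at test time. */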
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(eltwise_max_forward,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data_a, | |
__global const Dtype* bottom_data_b, const int_tp blob_idx, | |
__global Dtype* top_data, | |
__global int_tp* mask) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
Dtype maxval = -FLT_MAX; | |
int_tp maxidx = -1; | |
if (bottom_data_a[index] > bottom_data_b[index]) { | |
// only update for very first bottom_data blob (blob_idx == 0) | |
if (blob_idx == 0) { | |
maxval = bottom_data_a[index]; | |
top_data[index] = maxval; | |
maxidx = blob_idx; | |
mask[index] = maxidx; | |
} | |
} else { | |
maxval = bottom_data_b[index]; | |
top_data[index] = maxval; | |
maxidx = blob_idx + 1; | |
mask[index] = maxidx; | |
} | |
} | |
} | |
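/* Note: this kernel appears to be launched once per input blob beyond the
 * first, with top_data/mask carrying the running maximum; when the running
 * maximum (bottom_data_a) wins on a later call (blob_idx > 0), no write is
 * needed since top_data already holds that value. */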
__kernel void TEMPLATE(eltwise_max_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top_diff, | |
const int_tp blob_idx, | |
__global const int_tp* mask, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
Dtype gradient = 0; | |
if (mask[index] == blob_idx) { | |
gradient += top_diff[index]; | |
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(elu_forward,Dtype)(const int_tp n, __global const Dtype* in,
__global Dtype* out, | |
Dtype alpha) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out[index] = in[index] > 0 ? in[index] : alpha * (exp(in[index]) - 1.0); | |
} | |
} | |
__kernel void TEMPLATE(elu_backward,Dtype)(const int_tp n, __global const Dtype* in_diff,
__global const Dtype* out_data, | |
__global const Dtype* in_data, | |
__global Dtype* out_diff, | |
Dtype alpha) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out_diff[index] = | |
in_data[index] > 0 ? | |
in_diff[index] : in_diff[index] * (out_data[index] + alpha); | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(embed_forward,Dtype)(const int_tp nthreads, | |
__global const Dtype* bottom_data, | |
__global const Dtype* weight, | |
const int_tp M, const int_tp N, | |
const int_tp K, | |
__global Dtype* top_data) { | |
for (int_tp top_index = get_global_id(0); top_index < nthreads; | |
top_index += get_global_size(0)) { | |
const int_tp n = top_index / N; | |
const int_tp d = top_index % N; | |
const int_tp index = (int_tp)(bottom_data[n]); | |
const int_tp weight_index = index * N + d; | |
top_data[top_index] = weight[weight_index]; | |
} | |
} | |
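// In effect: top_data[n, d] = weight[(int_tp)bottom_data[n], d] -- an embedding
// lookup that copies the N-wide weight row selected by bottom_data[n] into
// output row n.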
// atomic_add from: http://suhorukov.blogspot.com/2011/12/opencl-11-atomic-operations-on-floating.html | |
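// Sketch of the technique: emulate a floating-point atomic add with a
// compare-and-swap retry loop -- read the current value, compute the desired
// sum, then attempt to publish it with atomic_cmpxchg on the raw bit pattern
// (via the union type-pun); if another work-item changed *source in the
// meantime the swap fails and the loop retries with the fresh value.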
#if (TYPE == TYPE_FLOAT) | |
inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source, const Dtype operand) { | |
union { | |
uint_tp intVal; | |
Dtype floatVal; | |
} newVal; | |
union { | |
uint_tp intVal; | |
Dtype floatVal; | |
} prevVal; | |
do { | |
prevVal.floatVal = *source; | |
newVal.floatVal = prevVal.floatVal + operand; | |
} while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal); | |
} | |
__kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, | |
__global const Dtype* top_diff, const int_tp M, const int_tp N, const int_tp K, | |
__global Dtype* weight_diff) { | |
for (int_tp top_index = get_global_id(0); top_index < nthreads; | |
top_index += get_global_size(0)) { | |
const int_tp n = top_index / N; | |
const int_tp d = top_index % N; | |
const int_tp index = (int_tp)(bottom_data[n]); | |
const int_tp weight_index = index * N + d; | |
TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index), *(top_diff + top_index)); | |
} | |
} | |
#endif | |
#if (TYPE == TYPE_DOUBLE) | |
#ifdef ATOMICS_64_AVAILABLE | |
inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source, const Dtype operand) { | |
union { | |
unsigned long intVal; | |
Dtype floatVal; | |
} newVal; | |
union { | |
unsigned long intVal; | |
Dtype floatVal; | |
} prevVal; | |
do { | |
prevVal.floatVal = *source; | |
newVal.floatVal = prevVal.floatVal + operand; | |
} while (atom_cmpxchg((volatile __global unsigned long *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal); | |
} | |
__kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, | |
__global const Dtype* top_diff, const int_tp M, const int_tp N, const int_tp K, | |
__global Dtype* weight_diff) { | |
for (int_tp top_index = get_global_id(0); top_index < nthreads; | |
top_index += get_global_size(0)) { | |
const int_tp n = top_index / N; | |
const int_tp d = top_index % N; | |
const int_tp index = (int_tp)(bottom_data[n]); | |
const int_tp weight_index = index * N + d; | |
TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index), *(top_diff + top_index)); | |
} | |
} | |
#endif | |
#endif | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(fillbuffer,Dtype)(const int_tp n, const char alpha, __global char* x, | |
const int_tp offx) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
x[index + offx] = alpha; | |
} | |
} | |
__kernel void TEMPLATE(fill,Dtype)(const int_tp n, const Dtype alpha, __global Dtype* x, | |
const int_tp offx) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
x[index + offx] = alpha; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(im2col,Dtype)(const int_tp n, | |
__global const Dtype* data_im, | |
const int_tp data_im_off, | |
const int_tp height, const int_tp width, | |
const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp pad_h, | |
const int_tp pad_w, const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp dilation_h, | |
const int_tp dilation_w, | |
const int_tp height_col, | |
const int_tp width_col, | |
__global Dtype* data_col, | |
const int_tp data_col_off) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
const int_tp h_index = index / width_col; | |
const int_tp h_col = h_index % height_col; | |
const int_tp w_col = index % width_col; | |
const int_tp c_im = h_index / height_col; | |
const int_tp c_col = c_im * kernel_h * kernel_w; | |
const int_tp h_offset = h_col * stride_h - pad_h; | |
const int_tp w_offset = w_col * stride_w - pad_w; | |
__global Dtype* data_col_ptr = data_col + data_col_off; | |
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; | |
__global const Dtype* data_im_ptr = data_im + data_im_off; | |
data_im_ptr += (c_im * height + h_offset) * width + w_offset; | |
for (int_tp i = 0; i < kernel_h; ++i) { | |
for (int_tp j = 0; j < kernel_w; ++j) { | |
int_tp h_im = h_offset + i * dilation_h; | |
int_tp w_im = w_offset + j * dilation_w; | |
*data_col_ptr = | |
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? | |
data_im_ptr[i * dilation_h * width + j * dilation_w] : 0; | |
data_col_ptr += height_col * width_col; | |
} | |
} | |
} | |
} | |
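/* Worked example: with a 3x3 kernel, stride 1, no padding or dilation on a
 * 1x5x5 image, height_col = width_col = (5 - 3) / 1 + 1 = 3 and data_col is
 * (1*3*3) x (3*3): column (h_col, w_col) holds the 9 input pixels of that
 * receptive field, so the convolution reduces to one GEMM of the filter matrix
 * against data_col. */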
__kernel void TEMPLATE(col2im,Dtype)(const int_tp n, | |
__global const Dtype* data_col, | |
const int_tp data_col_off, | |
const int_tp height, const int_tp width, | |
const int_tp channels, | |
const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp pad_h, | |
const int_tp pad_w, const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp dilation_h, | |
const int_tp dilation_w, | |
const int_tp height_col, | |
const int_tp width_col, | |
__global Dtype* data_im, | |
const int_tp data_im_off) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
Dtype val = 0; | |
const int_tp w_im = index % width + pad_w; | |
const int_tp h_im = (index / width) % height + pad_h; | |
const int_tp c_im = index / (width * height); | |
int_tp kernel_extent_w = (kernel_w - 1) * dilation_w + 1; | |
int_tp kernel_extent_h = (kernel_h - 1) * dilation_h + 1; | |
// compute the start and end of the output | |
const int_tp w_col_start = | |
(w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; | |
const int_tp w_col_end = min(w_im / stride_w + 1, width_col); | |
const int_tp h_col_start = | |
(h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1; | |
const int_tp h_col_end = min(h_im / stride_h + 1, height_col); | |
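// Bounds sketch: output column w_col covers input columns
// [w_col * stride_w, w_col * stride_w + kernel_extent_w), so w_im is reachable
// iff (w_im - kernel_extent_w) / stride_w + 1 <= w_col <= w_im / stride_w,
// clamped to the valid range; the h bounds follow the same argument.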
// TODO: use LCM of stride and dilation to avoid unnecessary loops | |
for (int_tp h_col = h_col_start; h_col < h_col_end; h_col += 1) { | |
for (int_tp w_col = w_col_start; w_col < w_col_end; w_col += 1) { | |
int_tp h_k = (h_im - h_col * stride_h); | |
int_tp w_k = (w_im - w_col * stride_w); | |
if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { | |
h_k /= dilation_h; | |
w_k /= dilation_w; | |
int_tp data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) * | |
height_col + h_col) * width_col + w_col; | |
val += data_col[data_col_off + data_col_index]; | |
} | |
} | |
} | |
data_im[data_im_off + index] = val; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(im2col_nd, Dtype)(const int_tp n, const int_tp num_axes, | |
const int_tp channel_axis, | |
__global const Dtype* data_im, | |
const int_tp data_im_off, | |
__global const int_tp* im_shape, | |
__global const int_tp* col_shape, | |
__global const int_tp* kernel_shape, | |
__global const int_tp* pad, | |
__global const int_tp* stride, | |
__global const int_tp* dilation, | |
__global Dtype* data_col, | |
const int_tp data_col_off) { | |
int_tp d_temp[6]; | |
int_tp d_iter[6]; | |
int_tp i; | |
__global const int_tp* im_shape_ptr = im_shape + channel_axis; | |
__global const int_tp* col_shape_ptr = col_shape + channel_axis; | |
__local int_tp shared_dilation[6]; | |
__local int_tp shared_kernel_shape[6]; | |
__local int_tp shared_pad[6]; | |
__local int_tp shared_stride[6]; | |
__local int_tp shared_col_shape[6 + 1]; | |
__local int_tp shared_im_shape[6 + 1]; | |
for (int li = get_local_id(0); li < num_axes; li += get_local_size(0)) { | |
shared_dilation[li] = dilation[li]; | |
shared_kernel_shape[li] = kernel_shape[li]; | |
shared_pad[li] = pad[li]; | |
shared_stride[li] = stride[li]; | |
} | |
for (int li = get_local_id(0); li < num_axes + 1; li += get_local_size(0)) { | |
shared_col_shape[li] = col_shape_ptr[li]; | |
shared_im_shape[li] = im_shape_ptr[li]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
// channel_in starts as the flattened column index; the loop below peels off
// the spatial indices into d_temp, leaving the input channel.
int_tp channel_in = index; | |
int_tp channel_out = 1; | |
for (i = num_axes - 1; i >= 0; --i) { | |
d_temp[i] = channel_in % shared_col_shape[i + 1]; | |
channel_in /= shared_col_shape[i + 1]; | |
channel_out *= shared_kernel_shape[i]; | |
} | |
channel_out *= channel_in; | |
int_tp data_col_inc = 1; | |
for (i = 0; i < num_axes; ++i) { | |
channel_out *= shared_col_shape[i + 1]; | |
channel_out += d_temp[i]; | |
d_temp[i] = d_temp[i] * shared_stride[i] - shared_pad[i]; | |
channel_in *= shared_im_shape[i + 1]; | |
channel_in += d_temp[i]; | |
data_col_inc *= shared_col_shape[i + 1]; | |
d_iter[i] = 0; | |
} | |
__global Dtype* data_col_ptr = data_col + data_col_off + channel_out; | |
__global const Dtype* data_im_ptr = data_im + data_im_off + channel_in; | |
bool incremented; | |
do { | |
bool in_range = true; | |
for (i = 0; i < num_axes; ++i) { | |
const int_tp d_iter_im = d_iter[i] * shared_dilation[i] + d_temp[i]; | |
in_range &= d_iter_im >= 0 && d_iter_im < shared_im_shape[i + 1]; | |
if (!in_range) { | |
break; | |
} | |
} | |
if (in_range) { | |
int_tp data_im_offset = d_iter[0] * shared_dilation[0]; | |
for (i = 1; i < num_axes; ++i) { | |
data_im_offset *= shared_im_shape[i + 1]; | |
data_im_offset += d_iter[i] * shared_dilation[i]; | |
} | |
*data_col_ptr = data_im_ptr[data_im_offset]; | |
} else { | |
*data_col_ptr = 0; | |
} | |
data_col_ptr += data_col_inc; | |
incremented = false; | |
for (i = num_axes - 1; i >= 0; --i) { | |
const int_tp d_max = shared_kernel_shape[i]; | |
if (d_iter[i] == d_max - 1) { | |
d_iter[i] = 0; | |
} else { // d_iter[i] < d_max - 1 | |
++d_iter[i]; | |
incremented = true; | |
break; | |
} | |
} // for (int_tp i = num_axes - 1; i >= 0; --i) | |
} while (incremented); // do | |
} | |
} | |
__kernel void TEMPLATE(col2im_nd, Dtype)(const int_tp n, const int_tp num_axes, | |
const int_tp channel_axis, | |
__global const Dtype* data_col, | |
const int_tp data_col_off, | |
__global const int_tp* im_shape, | |
__global const int_tp* col_shape, | |
__global const int_tp* kernel_shape, | |
__global const int_tp* pad, | |
__global const int_tp* stride, | |
__global const int_tp* dilation, | |
__global Dtype* data_im, | |
const int_tp data_im_off) { | |
int_tp d_im[6]; | |
int_tp d_col_iter[6]; | |
int_tp d_col_start[6]; | |
int_tp d_col_end[6]; | |
__global const int_tp* im_shape_ptr = im_shape + channel_axis; | |
__global const int_tp* col_shape_ptr = col_shape + channel_axis; | |
__local int_tp shared_dilation[6]; | |
__local int_tp shared_kernel_shape[6]; | |
__local int_tp shared_pad[6]; | |
__local int_tp shared_stride[6]; | |
__local int_tp shared_col_shape[6 + 1]; | |
__local int_tp shared_im_shape[6 + 1]; | |
for (int li = get_local_id(0); li < num_axes; li += get_local_size(0)) { | |
shared_dilation[li] = dilation[li]; | |
shared_kernel_shape[li] = kernel_shape[li]; | |
shared_pad[li] = pad[li]; | |
shared_stride[li] = stride[li]; | |
} | |
for (int li = get_local_id(0); li < num_axes + 1; li += get_local_size(0)) { | |
shared_col_shape[li] = col_shape_ptr[li]; | |
shared_im_shape[li] = im_shape_ptr[li]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
// c_im starts as the flattened image index; the loop below peels off the
// spatial indices into d_im, leaving the image channel.
int_tp c_im = index; | |
// Calculate d_im (image dimensions). | |
for (int_tp i = num_axes - 1; i >= 0; --i) { | |
d_im[i] = c_im % shared_im_shape[i + 1] + shared_pad[i]; | |
c_im /= shared_im_shape[i + 1]; | |
} | |
// Calculate col start/end indices. | |
bool done = false; | |
for (int_tp i = 0; i < num_axes; ++i) { | |
const int_tp kernel_extent = shared_dilation[i] | |
* (shared_kernel_shape[i] - 1) + 1; | |
d_col_start[i] = d_col_iter[i] = | |
(d_im[i] < kernel_extent) ? | |
0 : (d_im[i] - kernel_extent) / shared_stride[i] + 1; | |
d_col_end[i] = min(d_im[i] / shared_stride[i] + 1, | |
shared_col_shape[i + 1]); | |
if (d_col_start[i] >= d_col_end[i]) { | |
// Skip computation if the dimension is 0 at any spatial axis -- | |
// final val will be 0. | |
data_im[index] = 0; | |
done = true; | |
break; // for (int_tp i = 0; i < num_axes; ++i) | |
} | |
} | |
if (!done) { | |
// Loop over the col to compute the output val. | |
Dtype val = 0; | |
bool incremented = true; | |
bool skip = false; | |
do { | |
// Compute the final offset. | |
int_tp final_offset = 0; | |
int_tp kernel_shape_prod = 1; | |
int_tp kernel_index; | |
for (int_tp i = num_axes - 1; i >= 0; --i) { | |
kernel_index = d_im[i] - d_col_iter[i] * shared_stride[i]; | |
if (kernel_index % shared_dilation[i]) { | |
skip = true; | |
break; | |
} else { | |
kernel_index /= shared_dilation[i]; | |
final_offset += kernel_index * kernel_shape_prod; | |
kernel_shape_prod *= shared_kernel_shape[i]; | |
} | |
} | |
if (!skip) { | |
final_offset += kernel_shape_prod * c_im; | |
for (int_tp i = 0; i < num_axes; ++i) { | |
final_offset *= shared_col_shape[i + 1]; | |
final_offset += d_col_iter[i]; | |
} | |
val += data_col[data_col_off + final_offset]; | |
} | |
skip = false; | |
incremented = false; | |
for (int_tp i = num_axes - 1; i >= 0; --i) { | |
const int_tp d_max = d_col_end[i]; | |
if (d_col_iter[i] == d_max - 1) { | |
d_col_iter[i] = d_col_start[i]; | |
} else { // d_col_iter[i] < d_max - 1 | |
++d_col_iter[i]; | |
incremented = true; | |
break; // for (int_tp i = num_axes - 1; i >= 0; --i) | |
} | |
} // for (int_tp i = num_axes - 1; i >= 0; --i) | |
} while (incremented); | |
data_im[data_im_off + index] = val; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(lrn_compute_output,Dtype)(const int_tp nthreads, | |
__global const Dtype* in, | |
__global const Dtype* scale, | |
const Dtype negative_beta, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
out[index] = in[index] * pow(scale[index], negative_beta); | |
} | |
} | |
__kernel void TEMPLATE(lrn_fill_scale,Dtype)(const int_tp nthreads, __global const Dtype* in, | |
const int_tp num, const int_tp channels, | |
const int_tp height, const int_tp width, const int_tp size, | |
const Dtype alpha_over_size, const Dtype k, | |
__global Dtype* const scale) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
// find out the local offset | |
const int_tp w = index % width; | |
const int_tp h = (index / width) % height; | |
const int_tp n = index / width / height; | |
const int_tp offset = (n * channels * height + h) * width + w; | |
const int_tp step = height * width; | |
__global const Dtype* in_off = in + offset; | |
__global Dtype* scale_off = scale + offset; | |
int_tp head = 0; | |
const int_tp pre_pad = (size - 1) / 2; | |
const int_tp post_pad = size - pre_pad - 1; | |
Dtype accum_scale = 0; | |
// fill the scale at [n, :, h, w] | |
// accumulate values | |
while (head < post_pad && head < channels) { | |
accum_scale += in_off[head * step] * in_off[head * step]; | |
++head; | |
} | |
// both add and subtract | |
while (head < channels) { | |
accum_scale += in_off[head * step] * in_off[head * step]; | |
if (head - size >= 0) { | |
accum_scale -= in_off[(head - size) * step] | |
* in_off[(head - size) * step]; | |
} | |
scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; | |
++head; | |
} | |
// subtract only | |
while (head < channels + post_pad) { | |
if (head - size >= 0) { | |
accum_scale -= in_off[(head - size) * step] | |
* in_off[(head - size) * step]; | |
} | |
scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; | |
++head; | |
} | |
} | |
} | |
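// In effect, for each (n, h, w) this fills, along the channel axis c:
//   scale[c] = k + alpha_over_size * sum over the size-wide window of in^2
// using a sliding running sum (add the entering channel, subtract the leaving
// one) instead of recomputing the window per output.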
__kernel void TEMPLATE(lrn_compute_diff,Dtype)(const int_tp nthreads, | |
__global const Dtype* bottom_data, | |
__global const Dtype* top_data, | |
__global const Dtype* scale, | |
__global const Dtype* top_diff, const int_tp num, | |
const int_tp channels, const int_tp height, | |
const int_tp width, const int_tp size, | |
const Dtype negative_beta, | |
const Dtype cache_ratio, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
// find out the local offset | |
const int_tp w = index % width; | |
const int_tp h = (index / width) % height; | |
const int_tp n = index / width / height; | |
const int_tp offset = (n * channels * height + h) * width + w; | |
const int_tp step = height * width; | |
__global const Dtype* bottom_off = bottom_data + offset; | |
__global const Dtype* top_off = top_data + offset; | |
__global const Dtype* scale_off = scale + offset; | |
__global const Dtype* top_diff_off = top_diff + offset; | |
__global Dtype* bottom_diff_off = bottom_diff + offset; | |
int_tp head = 0; | |
const int_tp pre_pad = size - (size + 1) / 2; | |
const int_tp post_pad = size - pre_pad - 1; | |
Dtype accum_ratio = 0; | |
// accumulate values | |
while (head < post_pad && head < channels) { | |
accum_ratio += top_diff_off[head * step] * top_off[head * step] | |
/ scale_off[head * step]; | |
++head; | |
} | |
// both add and subtract | |
while (head < channels) { | |
accum_ratio += top_diff_off[head * step] * top_off[head * step] | |
/ scale_off[head * step]; | |
if (head - size >= 0) { | |
accum_ratio -= top_diff_off[(head - size) * step] | |
* top_off[(head - size) * step] / scale_off[(head - size) * step]; | |
} | |
bottom_diff_off[(head - post_pad) * step] = top_diff_off[(head - post_pad) | |
* step] * pow(scale_off[(head - post_pad) * step], negative_beta) | |
- cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; | |
++head; | |
} | |
// subtract only | |
while (head < channels + post_pad) { | |
if (head - size >= 0) { | |
accum_ratio -= top_diff_off[(head - size) * step] | |
* top_off[(head - size) * step] / scale_off[(head - size) * step]; | |
} | |
bottom_diff_off[(head - post_pad) * step] = top_diff_off[(head - post_pad) | |
* step] * pow(scale_off[(head - post_pad) * step], negative_beta) | |
- cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; | |
++head; | |
} | |
} | |
} | |
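// The backward pass implements, per channel c with window W(c):
//   bottom_diff[c] = top_diff[c] * pow(scale[c], -beta)
//                    - cache_ratio * bottom[c] * sum_{c' in W(c)} top_diff[c'] * top[c'] / scale[c']
// where cache_ratio is precomputed on the host (presumably 2 * alpha * beta / size).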
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
inline Dtype TEMPLATE(lstm_sigmoid,Dtype)(const Dtype x) { | |
return (Dtype)1 / ((Dtype)1 + exp(-x)); | |
} | |
inline Dtype TEMPLATE(lstm_tanh,Dtype)(const Dtype x) { | |
return (Dtype)2 * TEMPLATE(lstm_sigmoid,Dtype)((Dtype)2 * x) - (Dtype)1; | |
} | |
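// Note: uses the identity tanh(x) = 2 * sigmoid(2x) - 1, so both activations
// share the single exp()-based sigmoid above.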
__kernel void TEMPLATE(lstm_acts_forward,Dtype)(const int_tp nthreads, const int_tp dim, | |
__global const Dtype* X, __global Dtype* X_acts) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp x_dim = 4 * dim; | |
const int_tp d = index % x_dim; | |
if (d < 3 * dim) { | |
X_acts[index] = TEMPLATE(lstm_sigmoid,Dtype)(X[index]); | |
} else { | |
X_acts[index] = TEMPLATE(lstm_tanh,Dtype)(X[index]); | |
} | |
} | |
} | |
__kernel void TEMPLATE(lstm_unit_forward,Dtype)(const int_tp nthreads, const int_tp dim, | |
__global const Dtype* C_prev, __global const Dtype* X, __global const Dtype* cont, | |
__global Dtype* C, __global Dtype* H) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp n = index / dim; | |
const int_tp d = index % dim; | |
__global const Dtype* X_offset = X + 4 * dim * n; | |
const Dtype i = X_offset[d]; | |
const Dtype f = X_offset[1 * dim + d]; | |
const Dtype o = X_offset[2 * dim + d]; | |
const Dtype g = X_offset[3 * dim + d]; | |
const Dtype c_prev = C_prev[index]; | |
const Dtype c = cont[n] * f * c_prev + i * g; | |
C[index] = c; | |
const Dtype tanh_c = TEMPLATE(lstm_tanh,Dtype)(c); | |
H[index] = o * tanh_c; | |
} | |
} | |
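// Per element, with gates i, f, o and candidate g read from X:
//   c_t = cont * f * c_prev + i * g,   h_t = o * tanh(c_t)
// where cont[n] (0 or 1) gates the carried cell state across sequence boundaries.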
__kernel void TEMPLATE(lstm_unit_backward,Dtype)(const int_tp nthreads, const int_tp dim, | |
__global const Dtype* C_prev, __global const Dtype* X, __global const Dtype* C, __global const Dtype* H, | |
__global const Dtype* cont, __global const Dtype* C_diff, __global const Dtype* H_diff, | |
__global Dtype* C_prev_diff, __global Dtype* X_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp n = index / dim; | |
const int_tp d = index % dim; | |
__global const Dtype* X_offset = X + 4 * dim * n; | |
const Dtype i = X_offset[d]; | |
const Dtype f = X_offset[1 * dim + d]; | |
const Dtype o = X_offset[2 * dim + d]; | |
const Dtype g = X_offset[3 * dim + d]; | |
const Dtype c_prev = C_prev[index]; | |
const Dtype c = C[index]; | |
const Dtype tanh_c = TEMPLATE(lstm_tanh,Dtype)(c); | |
__global Dtype* c_prev_diff = C_prev_diff + index; | |
__global Dtype* X_diff_offset = X_diff + 4 * dim * n; | |
__global Dtype* i_diff = X_diff_offset + d; | |
__global Dtype* f_diff = X_diff_offset + 1 * dim + d; | |
__global Dtype* o_diff = X_diff_offset + 2 * dim + d; | |
__global Dtype* g_diff = X_diff_offset + 3 * dim + d; | |
const Dtype c_term_diff = | |
C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c); | |
const Dtype cont_n = cont[n]; | |
*c_prev_diff = cont_n * c_term_diff * f; | |
*i_diff = c_term_diff * g; | |
*f_diff = cont_n * c_term_diff * c_prev; | |
*o_diff = H_diff[index] * tanh_c; | |
*g_diff = c_term_diff * i; | |
} | |
} | |
__kernel void TEMPLATE(lstm_acts_backward,Dtype)(const int_tp nthreads, const int_tp dim, | |
__global const Dtype* X_acts, __global const Dtype* X_acts_diff, __global Dtype* X_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp x_dim = 4 * dim; | |
const int_tp d = index % x_dim; | |
const Dtype X_act = X_acts[index]; | |
if (d < 3 * dim) { | |
X_diff[index] = X_acts_diff[index] * X_act * ((Dtype)1 - X_act); | |
} else { | |
X_diff[index] = X_acts_diff[index] * ((Dtype)1 - X_act * X_act); | |
} | |
} | |
} | |
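// Note: the two branches apply the gate derivative identities
// sigmoid'(x) = s * (1 - s) for the first 3 * dim entries (gates i, f, o) and
// tanh'(x) = 1 - t * t for the last dim entries (candidate g).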
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(mul,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, | |
__global Dtype* b, | |
const int_tp offb, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index + offy] = a[index + offa] * b[index + offb]; | |
} | |
} | |
__kernel void TEMPLATE(div,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, | |
__global Dtype* b, | |
const int_tp offb, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index + offy] = a[index + offa] / b[index + offb]; | |
} | |
} | |
__kernel void TEMPLATE(add_scalar,Dtype)(const int_tp N, const Dtype alpha, | |
__global Dtype* Y, | |
const int_tp offY) { | |
for (int_tp index = get_global_id(0); index < N; index += get_global_size(0)) { | |
Y[offY + index] += alpha; | |
} | |
} | |
__kernel void TEMPLATE(add,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global const Dtype* b, | |
const int_tp offb, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = a[offa + index] + b[offb + index]; | |
} | |
} | |
__kernel void TEMPLATE(sub,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global const Dtype* b, | |
const int_tp offb, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = a[offa + index] - b[offb + index]; | |
} | |
} | |
__kernel void TEMPLATE(abs,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = fabs((Dtype)(a[offa + index])); | |
} | |
} | |
__kernel void TEMPLATE(exp,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = exp(a[offa + index]); | |
} | |
} | |
__kernel void TEMPLATE(log,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = log((Dtype)(a[offa + index])); | |
} | |
} | |
__kernel void TEMPLATE(powx,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, Dtype alpha, | |
__global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
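// Note: for alpha == 2 the base is passed through fabs() first; since
// x^2 == |x|^2 this is mathematically equivalent, presumably a workaround for
// pow() with negative bases on some OpenCL implementations.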
if(alpha == 2.0) { | |
y[offy + index] = pow((Dtype)fabs(a[offa + index]), (Dtype)alpha); | |
} else { | |
y[offy + index] = pow((Dtype)a[offa + index], (Dtype)alpha); | |
} | |
} | |
} | |
__kernel void TEMPLATE(sign,Dtype)(const int_tp n, __global const Dtype* x, | |
const int_tp offx, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index + offy] = (0.0 < x[index + offx]) | |
- (x[index + offx] < 0.0); | |
} | |
} | |
__kernel void TEMPLATE(sgnbit,Dtype)(const int_tp n, __global const Dtype* x, | |
const int_tp offx, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index + offy] = signbit(x[index + offx]); | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(merge_copy_forward_stack, Dtype)(const int_tp nthreads, | |
const int_tp dims, | |
__global const Dtype* bottom_a, | |
const int_tp forward_a, | |
__global const Dtype* bottom_b, | |
const int_tp forward_b, | |
__global Dtype* top, | |
const int_tp num, | |
const int_tp channels_a, | |
const int_tp channels_b, | |
__global const int_tp* shape_a, | |
__global const int_tp* shape_b) { | |
int_tp pad[6]; | |
int_tp tmp_idx[6]; | |
int_tp size_a = 1; | |
int_tp size_b = 1; | |
for (int_tp i = 0; i < dims; ++i) { | |
pad[i] = (shape_b[i] - shape_a[i]) / 2; | |
size_a *= shape_a[i]; | |
size_b *= shape_b[i]; | |
} | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
int_tp batch_id = index / ((channels_a + channels_b) * size_a); | |
int_tp bottom_id = ((index - batch_id * (channels_a + channels_b) * size_a) | |
/ (channels_a * size_a)) % 2; | |
int_tp counter = index; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
tmp_idx[i] = counter % shape_a[i]; | |
counter /= shape_a[i]; | |
} | |
if (bottom_id == 0) { | |
int_tp channel_id = (index / size_a) % channels_a; | |
int_tp aidx = batch_id * channels_a + channel_id; | |
for (int_tp i = 0; i < dims; ++i) { | |
aidx *= shape_a[i]; | |
aidx += tmp_idx[i]; | |
} | |
top[index] = (forward_a == 1) ? bottom_a[aidx] : 0; | |
} else { | |
int_tp channel_id = (index / size_a) % channels_b; | |
int_tp bidx = (batch_id * channels_b + channel_id) * size_b; | |
int_tp btemp = 1; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
bidx += btemp * (tmp_idx[i] + pad[i]); | |
btemp *= shape_b[i]; | |
} | |
top[index] = (forward_b == 1) ? bottom_b[bidx] : 0; | |
} | |
} | |
} | |
__kernel void TEMPLATE(merge_copy_backward_stack,Dtype)(const int_tp nthreads, | |
const int_tp dims, | |
__global Dtype* bottom_a, | |
const int_tp backward_a, | |
__global Dtype* bottom_b, | |
const int_tp backward_b, | |
__global const Dtype* top, | |
const int_tp num, | |
const int_tp channels_a, | |
const int_tp channels_b, | |
__global const int_tp* shape_a, | |
__global const int_tp* shape_b) { | |
int_tp pad[6]; | |
int_tp tmp_idx[6]; | |
int_tp size_a = 1; | |
int_tp size_b = 1; | |
for (int_tp i = 0; i < dims; ++i) { | |
pad[i] = (shape_b[i] - shape_a[i]) / 2; | |
size_a *= shape_a[i]; | |
size_b *= shape_b[i]; | |
} | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp batch_id = index / ((channels_a + channels_b) * size_a); | |
int_tp bottom_id = ((index - batch_id * (channels_a + channels_b) * size_a) | |
/ (channels_a * size_a)) % 2; | |
int_tp counter = index; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
tmp_idx[i] = counter % shape_a[i]; | |
counter /= shape_a[i]; | |
} | |
if (bottom_id == 0) { | |
int_tp channel_id = (index / size_a) % channels_a; | |
int_tp aidx = batch_id * channels_a + channel_id; | |
for (int_tp i = 0; i < dims; ++i) { | |
aidx *= shape_a[i]; | |
aidx += tmp_idx[i]; | |
} | |
bottom_a[aidx] = (backward_a == 1) ? top[index] : 0; | |
} else { | |
int_tp channel_id = (index / size_a) % channels_b; | |
int_tp bidx = (batch_id * channels_b + channel_id) * size_b; | |
int_tp btemp = 1; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
bidx += btemp * (tmp_idx[i] + pad[i]); | |
btemp *= shape_b[i]; | |
} | |
bottom_b[bidx] = (backward_b == 1) ? top[index] : 0; | |
} | |
} | |
} | |
__kernel void TEMPLATE(merge_copy_forward_add, Dtype)(const int_tp nthreads, | |
const int_tp dims, | |
__global const Dtype* bottom_a, | |
const int_tp forward_a, | |
__global const Dtype* bottom_b, | |
const int_tp forward_b, | |
__global Dtype* top, | |
const int_tp num, | |
const int_tp channels, | |
__global const int_tp* shape_a, | |
__global const int_tp* shape_b) { | |
int_tp pad[6]; | |
int_tp tmp_idx[6]; | |
int_tp size_a = 1; | |
int_tp size_b = 1; | |
for (int_tp i = 0; i < dims; ++i) { | |
pad[i] = (shape_b[i] - shape_a[i]) / 2; | |
size_a *= shape_a[i]; | |
size_b *= shape_b[i]; | |
} | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp batch_id = index / (channels * size_a); | |
int_tp counter = index; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
tmp_idx[i] = counter % shape_a[i]; | |
counter /= shape_a[i]; | |
} | |
top[index] = 0; | |
int_tp channel_id = (index / size_a) % channels; | |
int_tp aidx = batch_id * channels + channel_id; | |
for (int_tp i = 0; i < dims; ++i) { | |
aidx *= shape_a[i]; | |
aidx += tmp_idx[i]; | |
} | |
top[index] = forward_a ? top[index] + bottom_a[aidx] : top[index]; | |
int_tp bidx = (batch_id * channels + channel_id) * size_b; | |
int_tp btemp = 1; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
bidx += btemp * (tmp_idx[i] + pad[i]); | |
btemp *= shape_b[i]; | |
} | |
top[index] = forward_b ? top[index] + bottom_b[bidx] : top[index]; | |
} | |
} | |
__kernel void TEMPLATE(merge_copy_backward_add,Dtype)(const int_tp nthreads, | |
const int_tp dims, | |
__global Dtype* bottom_a, | |
const int_tp backward_a, | |
__global Dtype* bottom_b, | |
const int_tp backward_b, | |
__global const Dtype* top, | |
const int_tp num, | |
const int_tp channels, | |
__global const int_tp* shape_a, | |
__global const int_tp* shape_b) { | |
int_tp pad[6]; | |
int_tp tmp_idx[6]; | |
int_tp size_a = 1; | |
int_tp size_b = 1; | |
for (int_tp i = 0; i < dims; ++i) { | |
pad[i] = (shape_b[i] - shape_a[i]) / 2; | |
size_a *= shape_a[i]; | |
size_b *= shape_b[i]; | |
} | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp batch_id = index / (channels * size_a); | |
int_tp counter = index; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
tmp_idx[i] = counter % shape_a[i]; | |
counter /= shape_a[i]; | |
} | |
int_tp channel_id = (index / size_a) % channels; | |
int_tp aidx = batch_id * channels + channel_id; | |
for (int_tp i = 0; i < dims; ++i) { | |
aidx *= shape_a[i]; | |
aidx += tmp_idx[i]; | |
} | |
bottom_a[aidx] = backward_a ? top[index] : 0; | |
int_tp bidx = (batch_id * channels + channel_id) * size_b; | |
int_tp btemp = 1; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
bidx += btemp * (tmp_idx[i] + pad[i]); | |
btemp *= shape_b[i]; | |
} | |
bottom_b[bidx] = backward_b ? top[index] : 0; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(max_pool_forward,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* top_data, | |
const int use_mask, __global int_tp* mask, __global Dtype* top_mask) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp pw = index % pooled_width; | |
const int_tp ph = (index / pooled_width) % pooled_height; | |
const int_tp c = (index / pooled_width / pooled_height) % channels; | |
const int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
const int_tp hend = min(hstart + kernel_h, height); | |
const int_tp wend = min(wstart + kernel_w, width); | |
hstart = max(hstart, (int_tp)0); | |
wstart = max(wstart, (int_tp)0); | |
Dtype maxval = -FLT_MAX; | |
int_tp maxidx = -1; | |
__global const Dtype* bottom_slice = bottom_data | |
+ (n * channels + c) * height * width; | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
if (bottom_slice[h * width + w] > maxval) { | |
maxidx = h * width + w; | |
maxval = bottom_slice[maxidx]; | |
} | |
} | |
} | |
top_data[index] = maxval; | |
if (use_mask == 1) { | |
mask[index] = maxidx; | |
} else { | |
top_mask[index] = maxidx; | |
} | |
} | |
} | |
__kernel void TEMPLATE(ave_pool_forward,Dtype)( | |
const int_tp nthreads, __global const Dtype* const bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, | |
const int_tp pad_w, __global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
{ | |
const int_tp pw = index % pooled_width; | |
const int_tp ph = (index / pooled_width) % pooled_height; | |
const int_tp c = (index / pooled_width / pooled_height) % channels; | |
const int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
int_tp hend = min(hstart + kernel_h, height + pad_h); | |
int_tp wend = min(wstart + kernel_w, width + pad_w); | |
const int_tp pool_size = (hend - hstart) * (wend - wstart); | |
hstart = max(hstart, (int_tp)0); | |
wstart = max(wstart, (int_tp)0); | |
hend = min(hend, height); | |
wend = min(wend, width); | |
Dtype aveval = 0; | |
__global const Dtype* bottom_slice = bottom_data | |
+ (n * channels + c) * height * width; | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
aveval += bottom_slice[h * width + w]; | |
} | |
} | |
top_data[index] = aveval / pool_size; | |
} | |
} | |
} | |
__kernel void TEMPLATE(sto_pool_forward_train,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, | |
__global Dtype* rand_idx, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp pw = index % pooled_width; | |
const int_tp ph = (index / pooled_width) % pooled_height; | |
const int_tp c = (index / pooled_width / pooled_height) % channels; | |
const int_tp n = index / pooled_width / pooled_height / channels; | |
const int_tp hstart = ph * stride_h; | |
const int_tp hend = min(hstart + kernel_h, height); | |
const int_tp wstart = pw * stride_w; | |
const int_tp wend = min(wstart + kernel_w, width); | |
Dtype cumsum = 0.; | |
__global const Dtype* bottom_slice = bottom_data | |
+ (n * channels + c) * height * width; | |
// First pass: get sum | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
cumsum += bottom_slice[h * width + w]; | |
} | |
} | |
const float thres = rand_idx[index] * cumsum; | |
// Second pass: get value, and set index. | |
cumsum = 0; | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
cumsum += bottom_slice[h * width + w]; | |
if (cumsum >= thres) { | |
rand_idx[index] = ((n * channels + c) * height + h) * width + w; | |
top_data[index] = bottom_slice[h * width + w]; | |
h = hend; | |
w = wend; | |
} | |
} | |
} | |
} | |
} | |
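// Stochastic pooling (training): each output samples one input of its window
// with probability proportional to the activation -- rand_idx in [0, 1) scales
// the window sum into a threshold, the first position whose running sum crosses
// it wins (inverse-CDF sampling), and its index is stored back into rand_idx
// for the backward pass.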
__kernel void TEMPLATE(sto_pool_forward_test,Dtype)( | |
const int_tp nthreads, __global const Dtype* const bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp pw = index % pooled_width; | |
const int_tp ph = (index / pooled_width) % pooled_height; | |
const int_tp c = (index / pooled_width / pooled_height) % channels; | |
const int_tp n = index / pooled_width / pooled_height / channels; | |
const int_tp hstart = ph * stride_h; | |
const int_tp hend = min(hstart + kernel_h, height); | |
const int_tp wstart = pw * stride_w; | |
const int_tp wend = min(wstart + kernel_w, width); | |
    // Seed cumsum with FLT_MIN (rather than 0) to avoid divide-by-zero
Dtype cumsum = FLT_MIN; | |
Dtype cumvalues = 0.; | |
__global const Dtype* bottom_slice = bottom_data | |
+ (n * channels + c) * height * width; | |
// First pass: get sum | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
cumsum += bottom_slice[h * width + w]; | |
cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; | |
} | |
} | |
top_data[index] = cumvalues / cumsum; | |
} | |
} | |
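// Max pooling backward: each bottom element accumulates top_diff from every
// pooled output whose recorded argmax (mask, or top_mask when use_mask != 1)
// points back at it.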
__kernel void TEMPLATE(max_pool_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top_diff, | |
const int use_mask, | |
__global const int_tp* mask, | |
__global const Dtype* top_mask, | |
const int_tp num, | |
const int_tp channels, | |
const int_tp height, | |
const int_tp width, | |
const int_tp pooled_height, | |
const int_tp pooled_width, | |
const int_tp kernel_h, | |
const int_tp kernel_w, | |
const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
    // Recover the local (n, c, h, w) coordinates of this bottom element.
const int_tp w = index % width; | |
const int_tp h = (index / width) % height; | |
const int_tp c = (index / width / height) % channels; | |
const int_tp n = index / width / height / channels; | |
const int_tp phstart = | |
(h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; | |
const int_tp phend = min((h + pad_h) / stride_h + 1, pooled_height); | |
const int_tp pwstart = | |
(w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1; | |
const int_tp pwend = min((w + pad_w) / stride_w + 1, pooled_width); | |
Dtype gradient = 0; | |
const int_tp offset = (n * channels + c) * pooled_height * pooled_width; | |
__global const Dtype* top_diff_slice = top_diff + offset; | |
if (use_mask == 1) { | |
__global const int_tp* mask_slice = mask + offset; | |
for (int_tp ph = phstart; ph < phend; ++ph) { | |
for (int_tp pw = pwstart; pw < pwend; ++pw) { | |
if (mask_slice[ph * pooled_width + pw] == h * width + w) { | |
gradient += top_diff_slice[ph * pooled_width + pw]; | |
} | |
} | |
} | |
} else { | |
__global const Dtype* top_mask_slice = top_mask + offset; | |
for (int_tp ph = phstart; ph < phend; ++ph) { | |
for (int_tp pw = pwstart; pw < pwend; ++pw) { | |
if (top_mask_slice[ph * pooled_width + pw] == h * width + w) { | |
gradient += top_diff_slice[ph * pooled_width + pw]; | |
} | |
} | |
} | |
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
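// Average pooling backward: each bottom element receives top_diff / pool_size
// from every pooled window that covers it, mirroring the forward averaging.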
__kernel void TEMPLATE(ave_pool_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top_diff, | |
const int_tp num, | |
const int_tp channels, | |
const int_tp height, | |
const int_tp width, | |
const int_tp pooled_height, | |
const int_tp pooled_width, | |
const int_tp kernel_h, | |
const int_tp kernel_w, | |
const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
    // Recover the local (n, c, h, w) coordinates of this bottom element.
const int_tp w = index % width + pad_w; | |
const int_tp h = (index / width) % height + pad_h; | |
const int_tp c = (index / width / height) % channels; | |
const int_tp n = index / width / height / channels; | |
const int_tp phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; | |
const int_tp phend = min(h / stride_h + 1, pooled_height); | |
const int_tp pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; | |
const int_tp pwend = min(w / stride_w + 1, pooled_width); | |
Dtype gradient = 0.0; | |
__global const Dtype* const top_diff_slice = top_diff | |
+ (n * channels + c) * pooled_height * pooled_width; | |
for (int_tp ph = phstart; ph < phend; ++ph) { | |
for (int_tp pw = pwstart; pw < pwend; ++pw) { | |
// figure out the pooling size | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
int_tp hend = min(hstart + kernel_h, height + pad_h); | |
int_tp wend = min(wstart + kernel_w, width + pad_w); | |
int_tp pool_size = (hend - hstart) * (wend - wstart); | |
gradient += top_diff_slice[ph * pooled_width + pw] / pool_size; | |
} | |
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
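// Stochastic pooling backward: the gradient flows only to the element whose
// flat index was recorded in rand_idx during the training forward pass.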
__kernel void TEMPLATE(sto_pool_backward,Dtype)( | |
const int_tp nthreads, __global const Dtype* rand_idx, | |
__global const Dtype* const top_diff, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h, | |
const int_tp stride_w, __global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
    // Recover the local (n, c, h, w) coordinates of this bottom element.
const int_tp w = index % width; | |
const int_tp h = (index / width) % height; | |
const int_tp c = (index / width / height) % channels; | |
const int_tp n = index / width / height / channels; | |
const int_tp phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; | |
const int_tp phend = min(h / stride_h + 1, pooled_height); | |
const int_tp pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; | |
const int_tp pwend = min(w / stride_w + 1, pooled_width); | |
Dtype gradient = 0.0; | |
__global const Dtype* rand_idx_slice = rand_idx | |
+ (n * channels + c) * pooled_height * pooled_width; | |
__global const Dtype* top_diff_slice = top_diff | |
+ (n * channels + c) * pooled_height * pooled_width; | |
for (int_tp ph = phstart; ph < phend; ++ph) { | |
for (int_tp pw = pwstart; pw < pwend; ++pw) { | |
gradient += top_diff_slice[ph * pooled_width + pw] | |
* (index == (int_tp) (rand_idx_slice[ph * pooled_width + pw])?1.0:0.0); | |
} | |
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
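// N-dimensional max pooling with dilation. The fixed-size d_* scratch arrays
// cap the number of spatial axes at 6; shape, stride, dilation and pad are
// passed as per-axis arrays instead of scalar h/w pairs.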
__kernel void TEMPLATE(max_pool_forward_nd, Dtype)(const int_tp n, | |
const int_tp num_axes, | |
__global const Dtype* bottom_data, | |
const int_tp channels, | |
__global const int_tp* size, | |
__global const int_tp* pooled_size, | |
__global const int_tp* kernel_size, | |
__global const int_tp* ext_kernel_size, | |
__global const int_tp* stride, | |
__global const int_tp* dilation, | |
__global const int_tp* pad, | |
__global Dtype* top_data, | |
const int use_mask, | |
__global int_tp* mask, __global Dtype* top_mask) { | |
int_tp d_idx[6]; | |
int_tp d_start[6]; | |
int_tp d_end[6]; | |
int_tp d_iter[6]; | |
int_tp i; | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
int_tp offset = 1; | |
int_tp num = index; | |
bool do_continue = false; | |
for (i = num_axes - 1; i >= 0; --i) { | |
d_idx[i] = num % pooled_size[i]; | |
d_start[i] = d_idx[i] * stride[i] - pad[i]; | |
d_end[i] = min(d_start[i] + ext_kernel_size[i], size[i]); | |
d_start[i] = max(d_start[i], (int_tp)0); | |
num /= pooled_size[i]; | |
offset *= size[i]; | |
d_iter[i] = d_start[i]; | |
if (d_start[i] >= d_end[i]) { | |
top_data[index] = -FLT_MAX; | |
if (use_mask) { | |
mask[index] = -1; | |
} else { | |
top_mask[index] = -1; | |
} | |
do_continue = true; | |
} | |
} | |
    if (do_continue) {
continue; | |
} | |
int_tp chan = num % channels; | |
num /= channels; | |
offset *= (num * channels + chan); | |
Dtype maxval = -FLT_MAX; | |
int_tp maxidx = -1; | |
int_tp final_offset = 0; | |
bool incremented; | |
do { | |
final_offset = offset; | |
int_tp size_prod = 1; | |
for (i = num_axes - 1; i >= 0; --i) { | |
final_offset += d_iter[i] * size_prod; | |
size_prod *= size[i]; | |
} | |
if (bottom_data[final_offset] > maxval) { | |
maxidx = final_offset; | |
maxval = bottom_data[maxidx]; | |
} | |
incremented = false; | |
for (i = num_axes - 1; i >= 0; --i) { | |
if (d_iter[i] >= d_end[i] - dilation[i]) { | |
d_iter[i] = d_start[i]; | |
} else { | |
d_iter[i] += dilation[i]; | |
incremented = true; | |
break; | |
} | |
} | |
} while (incremented); | |
top_data[index] = maxval; | |
if (use_mask == 1) { | |
mask[index] = maxidx; | |
} else { | |
top_mask[index] = maxidx; | |
} | |
} | |
} | |
__kernel void TEMPLATE(max_pool_backward_nd, Dtype)(const int_tp n, | |
const int_tp num_axes, | |
__global const Dtype* top_diff, | |
const int use_mask, | |
__global const int_tp* mask, | |
__global const Dtype* top_mask, | |
const int_tp channels, | |
__global const int_tp* size, | |
__global const int_tp* pooled_size, | |
__global const int_tp* kernel_size, | |
__global const int_tp* ext_kernel_size, | |
__global const int_tp* stride, | |
__global const int_tp* dilation, | |
__global const int_tp* pad, | |
__global Dtype* bottom_diff) { | |
int_tp d_idx[6]; | |
int_tp d_start[6]; | |
int_tp d_end[6]; | |
int_tp d_iter[6]; | |
int_tp i; | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
    // Recover the per-axis coordinates of this bottom element.
int_tp offset = 1; | |
    int_tp num = index;
    bool do_continue = false;
for (i = num_axes - 1; i >= 0; --i) { | |
d_idx[i] = num % size[i]; | |
if (dilation[i] > 1) { | |
d_start[i] = | |
(d_idx[i] < ext_kernel_size[i]) ? | |
d_idx[i] % dilation[i] : (d_idx[i] - ext_kernel_size[i]) + 1; | |
d_end[i] = | |
(d_idx[i] >= pooled_size[i]) ? | |
(pooled_size[i] - 1) | |
- (pooled_size[i] - 1 - d_start[i]) % dilation[i] : | |
d_idx[i]; | |
} else { | |
d_start[i] = | |
(d_idx[i] + pad[i] < kernel_size[i]) ? | |
0 : (d_idx[i] + pad[i] - kernel_size[i]) / stride[i] + 1; | |
d_end[i] = min((int_tp) ((d_idx[i] + pad[i]) / stride[i] + 1), | |
(int_tp) (pooled_size[i])); | |
} | |
num /= size[i]; | |
offset *= pooled_size[i]; | |
d_iter[i] = d_start[i]; | |
      if (d_start[i] > d_end[i]) {
        do_continue = true;
      }
    }
    // Skip this index (its gradient is 0) without abandoning the grid-stride
    // loop; an early return here would drop the remaining indices assigned
    // to this work-item.
    if (do_continue) {
      bottom_diff[index] = 0;
      continue;
    }
int_tp chan = num % channels; | |
num /= channels; | |
offset *= (num * channels + chan); | |
Dtype gradient = 0; | |
int_tp final_offset = 0; | |
int_tp im_offset = 0; | |
bool incremented; | |
do { | |
final_offset = offset; | |
im_offset = 0; | |
int_tp size_prod = 1; | |
int_tp pooled_size_prod = 1; | |
for (i = num_axes - 1; i >= 0; --i) { | |
final_offset += d_iter[i] * pooled_size_prod; | |
im_offset += d_idx[i] * size_prod; | |
size_prod *= size[i]; | |
pooled_size_prod *= pooled_size[i]; | |
} | |
if (use_mask) { | |
if (mask[final_offset] == im_offset) { | |
gradient += top_diff[final_offset]; | |
} | |
} else { | |
if (top_mask[final_offset] == im_offset) { | |
gradient += top_diff[final_offset]; | |
} | |
} | |
incremented = false; | |
for (i = num_axes - 1; i >= 0; --i) { | |
if (d_iter[i] > d_end[i] - dilation[i]) { | |
d_iter[i] = d_start[i]; | |
} else { | |
d_iter[i] += dilation[i]; | |
incremented = true; | |
break; | |
} | |
} | |
} while (incremented); | |
bottom_diff[index] = gradient; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
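// The *_sk kernels are the strided-kernel (dilated) pooling variants: each
// window spans ext_kernel_h x ext_kernel_w input positions but is sampled
// only every dilation_h / dilation_w elements.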
__kernel void TEMPLATE(max_pool_forward_sk,Dtype)(const int_tp nthreads, | |
__global Dtype* bottom_data, | |
const int_tp num, | |
const int_tp channels, | |
const int_tp height, | |
const int_tp width, | |
const int_tp pooled_height, | |
const int_tp pooled_width, | |
const int_tp kernel_h, | |
const int_tp kernel_w, | |
const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, | |
const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp dilation_h, | |
const int_tp dilation_w, | |
const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* top_data, | |
const int use_mask, | |
__global int_tp* mask, | |
__global Dtype* top_mask) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp pw = index % pooled_width; | |
int_tp ph = (index / pooled_width) % pooled_height; | |
int_tp c = (index / pooled_width / pooled_height) % channels; | |
int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
int_tp hend = min(hstart + ext_kernel_h, height); | |
int_tp wend = min(wstart + ext_kernel_w, width); | |
hstart = max(hstart, (int_tp) 0); | |
wstart = max(wstart, (int_tp) 0); | |
Dtype maxval = -FLT_MAX; | |
int_tp maxidx = -1; | |
__global Dtype* bottom_data_ptr = bottom_data | |
+ (n * channels + c) * height * width; | |
for (int_tp h = hstart; h < hend; h += dilation_h) { | |
for (int_tp w = wstart; w < wend; w += dilation_w) { | |
if (bottom_data_ptr[h * width + w] > maxval) { | |
maxidx = h * width + w; | |
maxval = bottom_data_ptr[maxidx]; | |
} | |
} | |
} | |
top_data[index] = maxval; | |
if (use_mask == 1) { | |
mask[index] = maxidx; | |
} else { | |
top_mask[index] = maxidx; | |
} | |
} | |
} | |
__kernel void TEMPLATE(max_pool_backward_sk,Dtype)( | |
const int_tp nthreads, __global const Dtype* top_diff, const int use_mask, | |
__global const int_tp* mask, __global const Dtype* top_mask, | |
const int_tp num, const int_tp channels, const int_tp height, | |
const int_tp width, const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, | |
const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
__global const int_tp* mask_ptr = mask; | |
__global const Dtype* top_diff_ptr = top_diff; | |
    // Recover the local (n, c, h, w) coordinates of this bottom element.
int_tp w = index % width; | |
int_tp h = (index / width) % height; | |
int_tp c = (index / width / height) % channels; | |
int_tp n = index / width / height / channels; | |
int_tp pooled_height_1 = pooled_height - 1; | |
int_tp pooled_width_1 = pooled_width - 1; | |
int_tp phstart = | |
(h < ext_kernel_h) ? h % dilation_h : (h - ext_kernel_h) + 1; | |
int_tp phend = | |
(h >= pooled_height) ? | |
pooled_height_1 - (pooled_height_1 - phstart) % dilation_h : h; | |
int_tp pwstart = | |
(w < ext_kernel_w) ? w % dilation_w : (w - ext_kernel_w) + 1; | |
int_tp pwend = | |
(w >= pooled_width) ? | |
pooled_width_1 - (pooled_width_1 - pwstart) % dilation_w : w; | |
Dtype gradient = 0; | |
int_tp offset = (n * channels + c) * pooled_height * pooled_width; | |
top_diff_ptr += offset; | |
if (use_mask == 1) { | |
mask_ptr += offset; | |
for (int_tp ph = phstart; ph <= phend; ph += dilation_h) { | |
for (int_tp pw = pwstart; pw <= pwend; pw += dilation_w) { | |
if (mask_ptr[ph * pooled_width + pw] == h * width + w) { | |
gradient += top_diff_ptr[ph * pooled_width + pw]; | |
} | |
} | |
} | |
    } else {
      // Offset top_mask to this (n, c) slice, mirroring the mask branch above
      // and the 2D max_pool_backward kernel.
      __global const Dtype* top_mask_ptr = top_mask + offset;
      for (int_tp ph = phstart; ph <= phend; ph += dilation_h) {
        for (int_tp pw = pwstart; pw <= pwend; pw += dilation_w) {
          if (top_mask_ptr[ph * pooled_width + pw] == h * width + w) {
            gradient += top_diff_ptr[ph * pooled_width + pw];
          }
        }
      }
    }
bottom_diff[index] = gradient; | |
} | |
} | |
__kernel void TEMPLATE(ave_pool_forward_sk,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, | |
const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp pw = index % pooled_width; | |
int_tp ph = (index / pooled_width) % pooled_height; | |
int_tp c = (index / pooled_width / pooled_height) % channels; | |
int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
int_tp hend = min(hstart + ext_kernel_h, height + pad_h); | |
int_tp wend = min(wstart + ext_kernel_w, width + pad_w); | |
hstart = max(hstart, (int_tp)0); | |
wstart = max(wstart, (int_tp)0); | |
hend = min(hend, height); | |
wend = min(wend, width); | |
Dtype aveval = 0; | |
__global const Dtype* bottom_data_ptr = bottom_data; | |
bottom_data_ptr += (n * channels + c) * height * width; | |
int_tp pool_size = 0; | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
aveval += bottom_data_ptr[h * width + w]; | |
++pool_size; | |
} | |
} | |
top_data[index] = aveval / pool_size; | |
} | |
} | |
__kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, | |
const int_tp dilation_h, const int_tp dilation_w, __global Dtype* rand_idx, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp pw = index % pooled_width; | |
int_tp ph = (index / pooled_width) % pooled_height; | |
int_tp c = (index / pooled_width / pooled_height) % channels; | |
int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h; | |
int_tp hend = min(hstart + ext_kernel_h, height); | |
int_tp wstart = pw * stride_w; | |
int_tp wend = min(wstart + ext_kernel_w, width); | |
Dtype cumsum = 0.; | |
__global const Dtype* bottom_data_ptr = bottom_data; | |
bottom_data_ptr += (n * channels + c) * height * width; | |
// First pass: get sum | |
for (int_tp h = hstart; h < hend; h += dilation_h) { | |
for (int_tp w = wstart; w < wend; w += dilation_w) { | |
cumsum += bottom_data_ptr[h * width + w]; | |
} | |
} | |
float thres = rand_idx[index] * cumsum; | |
// Second pass: get value, and set index. | |
cumsum = 0; | |
for (int_tp h = hstart; h < hend; h += dilation_h) { | |
for (int_tp w = wstart; w < wend; w += dilation_w) { | |
cumsum += bottom_data_ptr[h * width + w]; | |
if (cumsum >= thres) { | |
rand_idx[index] = ((n * channels + c) * height + h) * width + w; | |
top_data[index] = bottom_data_ptr[h * width + w]; | |
h = hend; | |
w = wend; | |
} | |
} | |
} | |
} | |
} | |
__kernel void TEMPLATE(sto_pool_forward_test_sk,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, | |
const int_tp dilation_h, const int_tp dilation_w, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp pw = index % pooled_width; | |
int_tp ph = (index / pooled_width) % pooled_height; | |
int_tp c = (index / pooled_width / pooled_height) % channels; | |
int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h; | |
int_tp hend = min(hstart + ext_kernel_h, height); | |
int_tp wstart = pw * stride_w; | |
int_tp wend = min(wstart + ext_kernel_w, width); | |
    // Seed cumsum with FLT_MIN (rather than 0) to avoid divide-by-zero
Dtype cumsum = FLT_MIN; | |
Dtype cumvalues = 0.; | |
__global const Dtype* bottom_data_ptr = bottom_data; | |
bottom_data_ptr += (n * channels + c) * height * width; | |
// First pass: get sum | |
for (int_tp h = hstart; h < hend; h += dilation_h) { | |
for (int_tp w = wstart; w < wend; w += dilation_w) { | |
cumsum += bottom_data_ptr[h * width + w]; | |
cumvalues += bottom_data_ptr[h * width + w] | |
* bottom_data_ptr[h * width + w]; | |
} | |
} | |
top_data[index] = cumvalues / cumsum; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
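// Slice: with forward == 1, copies one bottom slice along the slice axis into
// a top blob; otherwise scatters top gradients back into the bottom blob.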
__kernel void TEMPLATE(slice,Dtype)(const int_tp nthreads, | |
__global const Dtype* in_data, | |
const int forward, const int_tp num_slices, | |
const int_tp slice_size, | |
const int_tp bottom_slice_axis, | |
const int_tp top_slice_axis, | |
const int_tp offset_slice_axis, | |
__global Dtype* out_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp total_slice_size = slice_size * top_slice_axis; | |
const int_tp slice_num = index / total_slice_size; | |
const int_tp slice_index = index % total_slice_size; | |
const int_tp bottom_index = slice_index | |
+ (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size; | |
if (forward == 1) { | |
out_data[index] = in_data[bottom_index]; | |
} else { | |
out_data[bottom_index] = in_data[index]; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
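// Softmax loss forward: per-position negative log-likelihood,
// loss = -log(max(prob[label], FLT_MIN)); counts records how many positions
// contribute (ignored labels yield 0 loss and 0 count) for normalization.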
__kernel void TEMPLATE(softmax_loss_forward,Dtype)( | |
int_tp n, __global const Dtype* prob_data, __global const Dtype* label, | |
__global Dtype* loss, | |
const int_tp num, const int_tp dim, const int_tp spatial_dim, | |
const int has_ignore_label_, const int_tp ignore_label_, | |
__global Dtype* counts) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
    const int_tp n_idx = index / spatial_dim;  // Avoid shadowing parameter n.
    const int_tp s = index % spatial_dim;
    const int_tp label_value = (int_tp) (label[n_idx * spatial_dim + s]);
    if (has_ignore_label_ == 1 && label_value == ignore_label_) {
      loss[index] = 0;
      counts[index] = 0;
    } else {
      loss[index] = -log((Dtype)(
          max((Dtype) (prob_data[n_idx * dim + label_value * spatial_dim + s]),
              (Dtype) FLT_MIN)));
counts[index] = 1; | |
} | |
} | |
} | |
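// Softmax loss backward: expects the host to have pre-filled bottom_diff with
// the softmax probabilities (as Caffe does before launching this kernel), so
// subtracting 1 at the label channel yields d(loss)/d(input) = prob - 1 at
// the true class.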
__kernel void TEMPLATE(softmax_loss_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top, | |
__global const Dtype* label, | |
__global Dtype* bottom_diff, | |
const int_tp num, | |
const int_tp dim, | |
const int_tp spatial_dim, | |
const int has_ignore_label_, | |
const int_tp ignore_label_, | |
__global Dtype* counts) { | |
const int_tp channels = dim / spatial_dim; | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
const int_tp n = index / spatial_dim; | |
const int_tp s = index % spatial_dim; | |
const int_tp label_value = (int_tp) (label[n * spatial_dim + s]); | |
if (has_ignore_label_ == 1 && label_value == ignore_label_) { | |
for (int_tp c = 0; c < channels; ++c) { | |
bottom_diff[n * dim + c * spatial_dim + s] = 0; | |
} | |
counts[index] = 0; | |
} else { | |
bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; | |
counts[index] = 1; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
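// Solver update kernels. Common convention: on entry g holds the raw
// gradient, on exit the final step the host applies to the weights; h, h2,
// m and v are persistent per-parameter history buffers. AdaDelta below keeps
// h = E[g^2] and h2 = E[dx^2] and scales the gradient by
// sqrt((h2 + delta) / (h + delta)) before applying the learning rate.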
__kernel void TEMPLATE(ada_delta_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
__global Dtype* h2, | |
Dtype momentum, | |
Dtype delta, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype gi = g[i]; | |
Dtype hi = h[i] = momentum * h[i] + (1.0 - momentum) * gi * gi; | |
gi = gi * sqrt((h2[i] + delta) / (hi + delta)); | |
h2[i] = momentum * h2[i] + (1.0 - momentum) * gi * gi; | |
g[i] = local_rate * gi; | |
} | |
} | |
__kernel void TEMPLATE(ada_grad_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
Dtype delta, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype gi = g[i]; | |
Dtype hi = h[i] = h[i] + gi * gi; | |
g[i] = local_rate * gi / (sqrt(hi) + delta); | |
} | |
} | |
__kernel void TEMPLATE(adam_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* m, | |
__global Dtype* v, | |
Dtype beta1, | |
Dtype beta2, | |
Dtype eps_hat, | |
Dtype corrected_local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype gi = g[i]; | |
Dtype mi = m[i] = m[i] * beta1 + gi * (1 - beta1); | |
Dtype vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2); | |
g[i] = corrected_local_rate * mi / (sqrt(vi) + eps_hat); | |
} | |
} | |
__kernel void TEMPLATE(nesterov_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
Dtype momentum, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype hi = h[i]; | |
Dtype hi_new = h[i] = momentum * hi + local_rate * g[i]; | |
g[i] = (1 + momentum) * hi_new - momentum * hi; | |
} | |
} | |
__kernel void TEMPLATE(rms_prop_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
Dtype rms_decay, | |
Dtype delta, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype gi = g[i]; | |
Dtype hi = h[i] = rms_decay * h[i] + (1 - rms_decay) * gi * gi; | |
g[i] = local_rate * g[i] / (sqrt(hi) + delta); | |
} | |
} | |
__kernel void TEMPLATE(sgd_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
Dtype momentum, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
g[i] = h[i] = momentum * h[i] + local_rate * g[i]; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
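// Tile: forward replicates the bottom blob num_tiles times along the tile
// axis; tile_backward sums the gradient over all copies of each element.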
__kernel void TEMPLATE(tile,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, | |
const int_tp tile_size, const int_tp num_tiles, | |
const int_tp bottom_tile_axis, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp d = index % tile_size; | |
const int_tp b = (index / tile_size / num_tiles) % bottom_tile_axis; | |
const int_tp n = index / tile_size / num_tiles / bottom_tile_axis; | |
const int_tp bottom_index = (n * bottom_tile_axis + b) * tile_size + d; | |
top_data[index] = bottom_data[bottom_index]; | |
} | |
} | |
__kernel void TEMPLATE(tile_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top_diff, | |
const int_tp tile_size, | |
const int_tp num_tiles, | |
const int_tp bottom_tile_axis, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp d = index % tile_size; | |
const int_tp b = (index / tile_size) % bottom_tile_axis; | |
const int_tp n = index / tile_size / bottom_tile_axis; | |
bottom_diff[index] = 0; | |
int_tp top_index = (n * num_tiles * bottom_tile_axis + b) * tile_size + d; | |
for (int_tp t = 0; t < num_tiles; ++t) { | |
bottom_diff[index] += top_diff[top_index]; | |
top_index += bottom_tile_axis * tile_size; | |
} | |
} | |
} | |
#endif // DOUBLE_SUPPORT_AVAILABLE | |
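For reference, a minimal host-side sketch of launching one of the kernels above, assuming a cl_program already built from this source plus existing queue, g_buf and h_buf objects (all hypothetical names). The kernel name "sgd_update_float" follows the TEMPLATE(name,Dtype) expansion, and int_tp is a 32-bit int in this build; error handling is elided.

#include <CL/cl.h>

/* Hypothetical helper: enqueue sgd_update_float over n parameters. */
static void run_sgd_update(cl_program program, cl_command_queue queue,
                           cl_mem g_buf, cl_mem h_buf, cl_int n,
                           float momentum, float local_rate) {
  cl_int err;
  cl_kernel kernel = clCreateKernel(program, "sgd_update_float", &err);
  clSetKernelArg(kernel, 0, sizeof(cl_int), &n);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &g_buf);
  clSetKernelArg(kernel, 2, sizeof(cl_mem), &h_buf);
  clSetKernelArg(kernel, 3, sizeof(float), &momentum);
  clSetKernelArg(kernel, 4, sizeof(float), &local_rate);
  /* The kernel body is a grid-stride loop, so any global size >= 1 covers
   * all n elements; a multiple of a typical work-group size is chosen. */
  size_t global_size = 16384;
  clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL,
                         0, NULL, NULL);
  clReleaseKernel(kernel);
}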
Note: Google Test filter = *OpenCLKernelCompileTest* | |
[==========] Running 2 tests from 2 test cases. | |
[----------] Global test environment set-up. | |
[----------] 1 test from OpenCLKernelCompileTest/0, where TypeParam = float | |
[ RUN ] OpenCLKernelCompileTest/0.TestCompile | |
Kernel bundle: activation: OK | |
Kernel bundle: auxiliary: OK | |
Kernel bundle: batch_reindex: OK | |
Kernel bundle: benchmark: OK | |
Kernel bundle: bias: OK | |
Kernel bundle: bnll: OK | |
Kernel bundle: channel: OK | |
Kernel bundle: concat: OK | |
Kernel bundle: contrastive_loss: OK | |
Kernel bundle: conv_layer_spatial: OK | |
Kernel bundle: conv_spatial_helper: OK | |
Kernel bundle: crop: OK | |
Kernel bundle: dropout: OK | |
Kernel bundle: eltwise: OK | |
Kernel bundle: elu: OK | |
BUILD LOG: | |
(92:0) : error : atomic function "atomic_cmpxchg" not supported | |
Kernel bundle: fft: OK | |
Kernel bundle: fillbuffer: OK | |
Kernel bundle: im2col: OK | |
Kernel bundle: im2col_nd: OK | |
Kernel bundle: lrn: OK | |
Kernel bundle: lstm_unit: OK | |
Kernel bundle: math: OK | |
Kernel bundle: mergecrop: OK | |
Kernel bundle: pooling: OK | |
Kernel bundle: pooling_nd: OK | |
Kernel bundle: pooling_sk: OK | |
Kernel bundle: slice: OK | |
Kernel bundle: softmax_loss: OK | |
Kernel bundle: solvers: OK | |
Kernel bundle: tile: OK | |
src/caffe/test/test_ocl_kernel_compile.cpp:79: Failure | |
Value of: failure | |
Actual: true | |
Expected: false | |
[ FAILED ] OpenCLKernelCompileTest/0.TestCompile, where TypeParam = float (775 ms) | |
[----------] 1 test from OpenCLKernelCompileTest/0 (775 ms total) | |
[----------] 1 test from OpenCLKernelCompileTest/1, where TypeParam = double | |
[ RUN ] OpenCLKernelCompileTest/1.TestCompile | |
[ OK ] OpenCLKernelCompileTest/1.TestCompile (0 ms) | |
[----------] 1 test from OpenCLKernelCompileTest/1 (1 ms total) | |
[----------] Global test environment tear-down | |
[==========] 2 tests from 2 test cases ran. (777 ms total) | |
[ PASSED ] 1 test. | |
[ FAILED ] 1 test, listed below: | |
[ FAILED ] OpenCLKernelCompileTest/0.TestCompile, where TypeParam = float | |
1 FAILED TEST |
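The single failure above is the bundle between elu and fft (its name is not echoed when the build fails); the build log points at atomic_cmpxchg, which this device's compiler rejects. A hedged sketch of one possible workaround, not part of the original source: these kernels typically emulate atomic float accumulation through atomic_cmpxchg, so gating the emulation behind the base-atomics extension and falling back to a plain add lets the bundle compile on such devices.

#if defined(cl_khr_global_int32_base_atomics)
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
// Emulate atomic float add via 32-bit compare-and-swap on the bit pattern.
inline void atomic_add_float(volatile __global float* addr, float val) {
  union { unsigned int u; float f; } old_val, new_val;
  do {
    old_val.f = *addr;
    new_val.f = old_val.f + val;
  } while (atomic_cmpxchg((volatile __global unsigned int*) addr,
                          old_val.u, new_val.u) != old_val.u);
}
#else
// Fallback when the device lacks base atomics: plain add, races possible.
inline void atomic_add_float(volatile __global float* addr, float val) {
  *addr += val;
}
#endif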