@luan-cestari
Created February 25, 2016 10:39
~/torch ⮀ master ⮀ 7d17h28m ⮀ uname -a ⮂ ruby-2.3.0
Linux localhost.localdomain 4.3.5-300.fc23.x86_64 #1 SMP Mon Feb 1 03:18:41 UTC 2016 x86_64 x86_64 x86_64 GNU/Linux
~/torch ⮀ master ⮀ 7d17h25m ⮀ clinfo ⮂ ruby-2.3.0
Number of platforms 1
Platform Name Clover
Platform Vendor Mesa
Platform Version OpenCL 1.1 MESA 11.1.0
Platform Profile FULL_PROFILE
Platform Extensions cl_khr_icd
Platform Extensions function suffix MESA
Platform Name Clover
Number of devices 1
Device Name AMD HAWAII (DRM 2.43.0, LLVM 3.7.0)
Device Vendor AMD
Device Vendor ID 0x1002
Device Version OpenCL 1.1 MESA 11.1.0
Driver Version 11.1.0
Device OpenCL C Version OpenCL C 1.1
Device Type GPU
Device Profile FULL_PROFILE
Max compute units 40
Max clock frequency 1020MHz
Max work item dimensions 3
Max work item sizes 256x256x256
Max work group size 256
Preferred work group size multiple 64
Preferred / native vector sizes
char 16 / 16
short 8 / 8
int 4 / 4
long 2 / 2
half 0 / 0 (n/a)
float 4 / 4
double 2 / 2 (cl_khr_fp64)
Half-precision Floating-point support (n/a)
Single-precision Floating-point support (core)
Denormals No
Infinity and NANs Yes
Round to nearest Yes
Round to zero No
Round to infinity No
IEEE754-2008 fused multiply-add No
Support is emulated in software No
Correctly-rounded divide and sqrt operations No
Double-precision Floating-point support (cl_khr_fp64)
Denormals Yes
Infinity and NANs Yes
Round to nearest Yes
Round to zero Yes
Round to infinity Yes
IEEE754-2008 fused multiply-add Yes
Support is emulated in software No
Correctly-rounded divide and sqrt operations No
Address bits 32, Little-Endian
Global memory size 1073741824 (1024MiB)
Error Correction support No
Max memory allocation 268435456 (256MiB)
Unified memory for Host and Device Yes
Minimum alignment for any data type 128 bytes
Alignment of base address 1024 bits (128 bytes)
Global Memory cache type None
Image support No
Local memory type Local
Local memory size 32768 (32KiB)
Max constant buffer size 268435456 (256MiB)
Max number of constant args 16
Max size of kernel argument 1024
Queue properties
Out-of-order execution No
Profiling Yes
Profiling timer resolution 0ns
Execution capabilities
Run OpenCL kernels Yes
Run native kernels No
Device Available Yes
Compiler Available Yes
Device Extensions cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_fp64
NULL platform behavior
clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) Clover
clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) Success [MESA]
clCreateContext(NULL, ...) [default] Success [MESA]
clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No devices found in platform
clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) Success (1)
Platform Name Clover
Device Name AMD HAWAII (DRM 2.43.0, LLVM 3.7.0)
clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No devices found in platform
clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No devices found in platform
clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) Success (1)
Platform Name Clover
Device Name AMD HAWAII (DRM 2.43.0, LLVM 3.7.0)
ICD loader properties
ICD loader Name OpenCL ICD Loader
ICD loader Vendor OCL Icd free software
ICD loader Version 2.2.7
ICD loader Profile OpenCL 1.2
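
For reference, the limits clinfo prints above (for example the 256 max work-group size and the extension list) can also be read straight from the OpenCL API. A minimal host-side sketch, assuming the caller already holds a cl_device_id for the Clover device; illustrative only, not part of this gist:

```c
#include <stdio.h>
#include <CL/cl.h>

/* Query a few of the limits clinfo reported, for an already-selected device. */
static void print_device_limits(cl_device_id device)
{
    size_t max_wg_size = 0;
    cl_uint max_cu = 0;
    char extensions[2048] = {0};

    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                    sizeof(max_wg_size), &max_wg_size, NULL);
    clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
                    sizeof(max_cu), &max_cu, NULL);
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS,
                    sizeof(extensions), extensions, NULL);

    printf("max work group size: %zu\n", max_wg_size);
    printf("max compute units:   %u\n", max_cu);
    printf("extensions:          %s\n", extensions);
}
```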
~/torch ⮀ master ⮀ 7d17h25m ⮀ luajit -l clnn -e 'clnn.test()' ⮂ ruby-2.3.0
libthclnn_searchpath /home/lcestari/torch/install/lib/lua/5.1/libTHCLNN.so
Running 68 tests
|___________________________________________________________________ ==> Abs_backwardUsing Mesa , OpenCL platform: Clover
Using OpenCL device: AMD HAWAII (DRM 2.43.0, LLVM 3.7.0)
____|_______________________________________________________________ ==> ClassNLLCriterionMultipleTargetTHClReduceAll.cl build log:
unsupported call to function reduceBlock in THClTensor_reduceAll
kernel build error:
kernel source:
1: inline unsigned int THClCeilDiv(unsigned int a, unsigned int b) {
2: return (a + b - 1) / b;
3: }
4:
5:
6:
7: inline float modifyOp(float _in1) {
8: float _out;
9: float *in1 = &_in1;
10: float *out = &_out;
11: *out = *in1;
12: return _out;
13: }
14:
15: inline float reduceOp(float _in1, float _in2) {
16: // I guess the compiler can sort this stuff out :-P
17: float _out;
18: float *in1 = &_in1;
19: float *in2 = &_in2;
20: float *out = &_out;
21: *out = *in1 + *in2;
22: return _out;
23: }
24:
25: // kernel argument that defines tensor layout
26: typedef struct TensorInfoCl {
27: // Extracts size/stride information for the kernel.
28: // Successive dimensions can be collapsed if the size/strides match
29: // up and thus there are no holes between the dimensions. This is used
30: // to reduce the complexity of the problem.
31: // The optional `reduceDim` indicates a reduction dimension for the
32: // given tensor, so that the output size for this dimension will be 1.
33:
34: unsigned int sizes[25];
35: unsigned int strides[25];
36: unsigned int offset;
37: int dims;
38: } TensorInfoCl;
39: // Contiguous tensors of more than one dimension are collapsed down
40: // to one tensor
41:
42:
43: // Translate a linear index for the apply to a float* offset;
44: // specialized on `Dims` to reduce nvcc compilation time
45:
46:
47: inline unsigned int IndexToOffset_998_get(unsigned int linearId, global const TensorInfoCl *info) {
48: return linearId + info->offset;
49: }
50:
51: inline unsigned int IndexToOffset_999_get(unsigned int linearId, global const TensorInfoCl *info) {
52: unsigned int offset = info->offset;
53:
54: // Use dynamic dims
55: for (int i = info->dims - 1; i >= 0; --i) {
56: unsigned int curDimIndex = linearId % info->sizes[i];
57: unsigned int curDimOffset = curDimIndex * info->strides[i];
58: offset += curDimOffset;
59:
60: linearId /= info->sizes[i];
61: }
62:
63: return offset;
64: }
65:
66: inline unsigned int getLinearBlockId() {
67: return get_group_id(2) * get_num_groups(1) * get_num_groups(0) +
68: get_group_id(1) * get_num_groups(0) +
69: get_group_id(0);
70: }
71:
72: // Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will
73: // return the reduced value
74:
75: inline float reduceBlock( local float* smem,
76: int numVals,
77: float threadVal,
78: float init) {
79: if (numVals == 0) {
80: return init;
81: }
82:
83: if ((int)get_local_id(0) < numVals) {
84: smem[ get_local_id(0)] = threadVal;
85: }
86:
87: // First warp will perform reductions across warps
88: barrier(CLK_LOCAL_MEM_FENCE);
89: if ((get_local_id(0) / 32) == 0) {
90: float r = (int)get_local_id(0) < numVals ? smem[get_local_id(0)] : init;
91:
92: for (int i = 32 + get_local_id(0); i < numVals; i += 32) {
93: r = reduceOp(r, smem[i]);
94: }
95:
96: smem[get_local_id(0)] = r;
97: }
98:
99: // First thread will perform reductions across the block
100: barrier(CLK_LOCAL_MEM_FENCE);
101:
102: float r = init;
103: if (get_local_id(0) == 0) {
104: r = smem[0];
105:
106: int numLanesParticipating = min(numVals, 32);
107:
108: if (numLanesParticipating == 32) {
109: // Unroll for 32 == 32 and numVals >= 32
110: // #pragma unroll
111: // unrolling by hand, so compiler-independent
112:
113: r = reduceOp(r, smem[1]);
114:
115: r = reduceOp(r, smem[2]);
116:
117: r = reduceOp(r, smem[3]);
118:
119: r = reduceOp(r, smem[4]);
120:
121: r = reduceOp(r, smem[5]);
122:
123: r = reduceOp(r, smem[6]);
124:
125: r = reduceOp(r, smem[7]);
126:
127: r = reduceOp(r, smem[8]);
128:
129: r = reduceOp(r, smem[9]);
130:
131: r = reduceOp(r, smem[10]);
132:
133: r = reduceOp(r, smem[11]);
134:
135: r = reduceOp(r, smem[12]);
136:
137: r = reduceOp(r, smem[13]);
138:
139: r = reduceOp(r, smem[14]);
140:
141: r = reduceOp(r, smem[15]);
142:
143: r = reduceOp(r, smem[16]);
144:
145: r = reduceOp(r, smem[17]);
146:
147: r = reduceOp(r, smem[18]);
148:
149: r = reduceOp(r, smem[19]);
150:
151: r = reduceOp(r, smem[20]);
152:
153: r = reduceOp(r, smem[21]);
154:
155: r = reduceOp(r, smem[22]);
156:
157: r = reduceOp(r, smem[23]);
158:
159: r = reduceOp(r, smem[24]);
160:
161: r = reduceOp(r, smem[25]);
162:
163: r = reduceOp(r, smem[26]);
164:
165: r = reduceOp(r, smem[27]);
166:
167: r = reduceOp(r, smem[28]);
168:
169: r = reduceOp(r, smem[29]);
170:
171: r = reduceOp(r, smem[30]);
172:
173: r = reduceOp(r, smem[31]);
174:
175: } else {
176: for (int i = 1; i < numLanesParticipating; ++i) {
177: r = reduceOp(r, smem[i]);
178: }
179: }
180: }
181:
182: return r;
183: }
184:
185:
186:
187:
188: // Kernel that handles an entire reduction of a tensor in one pass
189: kernel void
190: THClTensor_reduceAll(global TensorInfoCl *in_info,
191: global float *in_data,
192: unsigned int totalElements,
193: float init,
194: global float* out,
195: local float *smem) {
196: // With a block-wide stride, have each thread perform its own reduction.
197: float r = init;
198: for (unsigned int i = get_local_id(0); i < totalElements; i += get_local_size(0)) {
199: const unsigned int inOffset = IndexToOffset_998_get(i, &in_info[0]);
200: r = reduceOp(r, modifyOp(in_data[inOffset]));
201: }
202:
203: // Reduce within the block
204: r = reduceBlock(smem, get_local_size(0), r, init);
205:
206: if(get_local_id(0) == 0) {
207: // Write out reduced value
208: out[0] = r;
209: }
210: }
211:
212: inline unsigned int getStartIndex(unsigned int totalSize) {
213: unsigned int sizePerBlock = THClCeilDiv(totalSize, (unsigned int) get_num_groups(0));
214: return get_group_id(0) * sizePerBlock;
215: }
216:
217: inline unsigned int getEndIndex(unsigned int totalSize) {
218: unsigned int sizePerBlock = THClCeilDiv(totalSize, (unsigned int) get_num_groups(0));
219: return min((unsigned int) ((get_group_id(0) + 1) * sizePerBlock), totalSize);
220: }
221:
222: // Kernel that handles an entire reduction of a tensor in two passes
223: kernel void
224: THClTensor_reduceAllPass1(global TensorInfoCl *in_info,
225: global float *in_data,
226: unsigned int totalElements,
227: float init,
228: global float* scratchSpace,
229: local float *smem) {
230: const unsigned int startIndex = getStartIndex(totalElements);
231: const unsigned int endIndex = getEndIndex(totalElements);
232:
233: // With a block-wide stride, have each thread perform its own reduction.
234: float r = init;
235: for (unsigned int i = startIndex + get_local_id(0); i < endIndex; i += get_local_size(0)) {
236: const unsigned int inOffset = IndexToOffset_998_get(i, &in_info[0]);
237: r = reduceOp(r, modifyOp(in_data[inOffset]));
238: }
239:
240: // Reduce within the block
241: r = reduceBlock(smem, get_local_size(0), r, init);
242:
243: if ((int)get_local_id(0) == 0) {
244: // Write out block-wide reduced value
245: scratchSpace[get_group_id(0)] = r;
246: }
247: }
248:
249: kernel void THClTensor_reduceAllPass2(int numPass1Blocks,
250: float init,
251: global float* scratchSpace,
252: global float* out,
253: local float *smem) {
254: float r = init;
255: if ((int)get_local_id(0) < numPass1Blocks) {
256: r = scratchSpace[get_local_id(0)];
257: }
258:
259: // Reduce within the block
260: r = reduceBlock(smem, numPass1Blocks, r, init);
261:
262: if((int)get_local_id(0) == 0) {
263: out[0] = r;
264: }
265: }
266:
267:
268:
269:
Something went wrong with clCreateKernel, OpenCL error code -45
THClReduceAll.cl build log:
unsupported call to function reduceBlock in THClTensor_reduceAll
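
The "unsupported call to function reduceBlock" message means Clover was left with an out-of-line call to the helper inside THClTensor_reduceAll, even though reduceBlock is declared inline in the dumped source. A minimal sketch of one possible workaround, assuming Clover's Clang front end honours `__attribute__((always_inline))` (a guess, not the clnn authors' fix):

```c
/* Hedged sketch: same shape as the failing pattern (a kernel calling a helper
 * that takes a pointer to local memory), with inlining forced so that no
 * unresolved call to reduceBlock remains after codegen. */
__attribute__((always_inline))
inline float reduceBlock(local float *smem, int numVals, float threadVal, float init)
{
    if ((int)get_local_id(0) < numVals) {
        smem[get_local_id(0)] = threadVal;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    float r = init;
    if (get_local_id(0) == 0) {
        /* plain serial sum, standing in for reduceOp */
        for (int i = 0; i < numVals; ++i) {
            r += smem[i];
        }
    }
    return r;   /* only work-item 0 returns the reduced value */
}

kernel void reduce_demo(global const float *in, global float *out,
                        unsigned int totalElements, local float *smem)
{
    float r = 0.0f;
    for (unsigned int i = get_local_id(0); i < totalElements; i += get_local_size(0)) {
        r += in[i];
    }
    r = reduceBlock(smem, (int)get_local_size(0), r, 0.0f);
    if (get_local_id(0) == 0) {
        out[0] = r;
    }
}
```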
____*________|______________________________________________________ ==> LogSoftMax_backwardTHClTensorMathTransformReduce.cl build log:
input.cl:8:1: error: OpenCL does not support the 'static' storage class specifier
kernel build error:
kernel source:
1: // from lib/THC/THCTensorMathTransformReduce.cu:
2:
3: typedef struct Pair {
4: float first;
5: float second;
6: } Pair;
7:
8: static Pair binary_op( Pair a, Pair b ) {
9: if( a.first > b.first ) { return a; } else { return b; };
10: }
11:
12: /* A set of reduction kernels that take in binary ops on thrust pairs (of value, index).
13: These are useful when you not only have to do a reduction, but you might have
14: to preserve the location of contention (for example min/max operations).
15: The structure of the kernels follows the structure of the reduction kernels.
16: */
17: kernel void THClTensor_kernel_transformReduceOuterDimIndex(
18: global float *tgt1_data, int tgt1_offset,
19: global float *tgt2_data, int tgt2_offset,
20: global float *src__data, int src__offset,
21: int num_orows, int num_irows, int row_size
22: ) {
23: global float *tgt1 = tgt1_data + tgt1_offset;
24: global float *tgt2 = tgt2_data + tgt2_offset;
25: global float *src_ = src__data + src__offset;
26:
27: for (int orow = get_group_id(0); orow < num_orows; orow += get_num_groups(0)) {
28: for (int irow = get_group_id(1) * get_local_size(0) + get_local_id(0); irow < num_irows; irow += get_num_groups(1) * get_local_size(0)) {
29: global float *src = src_ + orow * row_size * num_irows + irow;
30: Pair acc = {.first=-3.40282e+38f, .second=-1};
31: for (int col = 0; col < row_size; ++col) {
32: Pair lhs = {*src, col+1};
33: acc = binary_op( lhs, acc);
34: // acc = binary_op(thrust::make_pair(*src, col+1), acc); // i+1 for 1-indexing
35: src += num_irows;
36: }
37: tgt1[orow * num_irows + irow] = acc.first;
38: tgt2[orow * num_irows + irow] = acc.second;
39: }
40: }
41: }
42:
43: /* Reduce the innermost dimension of a tensor (on thrust::pair functors which are (value, index))
44: *
45: * For an n-d tensor (n <= 4) where the reduction is along the innermost dimension:
46: *
47: * - block.x is the innermost dimension, i.e. dimension 0;
48: * - block.y and grid.y make up dimension 1; and
49: * - grid.x and grid z are the remaining two outer dimensions (if any)
50: *
51: * Reduction along other dimensions is handled in a separate kernel.
52: */
53: kernel void THClTensor_kernel_transformReduceInnermostDimIndex(
54: global float *tgt1_data, int tgt1_offset,
55: global float *tgt2_data, int tgt2_offset,
56: global float *src__data, int src__offset,
57: int num_rows, int row_size
58: ) {
59: global float *tgt1 = tgt1_data + tgt1_offset;
60: global float *tgt2 = tgt2_data + tgt2_offset;
61: global float *src_ = src__data + src__offset;
62:
63: local float sbuf[16][16];
64: local float ibuf[16][16];
65:
66: for (int block_row = get_group_id(0) * get_local_size(1); block_row < num_rows; block_row += get_local_size(1) * get_num_groups(0)) {
67: int row = block_row + get_local_id(1);
68: // thrust::pair<float,float> acc = init;
69: Pair acc = { .first=-3.40282e+38f, .second=-1 };
70: if (row < num_rows) {
71: global float *src = src_ + row * row_size;
72: // Sequential reduction within a thread.
73: for (int col = get_local_id(0); col < row_size; col += get_local_size(0)) {
74: Pair lhs = {src[col], col+1};
75: acc = binary_op(lhs, acc);
76: }
77: }
78:
79: sbuf[get_local_id(1)][get_local_id(0)] = acc.first;
80: ibuf[get_local_id(1)][get_local_id(0)] = acc.second;
81:
82: // Reduce intermediate values to single value.
83: local float* sline = &sbuf[get_local_id(1)][0];
84: local float* iline = &ibuf[get_local_id(1)][0];
85: for (int s = 8; s > 0; s >>= 1) {
86: if (row < num_rows && (int)get_local_id(0) < s) {
87: Pair arg1 = {.first=sline[get_local_id(0)], .second=iline[get_local_id(0)]};
88: Pair arg2 = {.first=sline[get_local_id(0) + s], .second=iline[get_local_id(0) + s]};
89: Pair res = binary_op(arg1, arg2);
90: sline[get_local_id(0)] = res.first;
91: iline[get_local_id(0)] = res.second;
92: }
93: barrier(CLK_LOCAL_MEM_FENCE);
94: }
95:
96: if (row < num_rows && get_local_id(0) == 0) {
97: tgt1[row] = sline[0];
98: tgt2[row] = iline[0];
99: }
100: barrier(CLK_LOCAL_MEM_FENCE);
101: }
102: }
103:
104:
Something went wrong with clCreateKernel, OpenCL error code -45
THClTensorMathTransformReduce.cl build log:
input.cl:8:1: error: OpenCL does not support the 'static' storage class specifier
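
input.cl:8 in this dump is the `static Pair binary_op(...)` definition, and Clover only advertises OpenCL C 1.1, which does not allow the `static` storage-class specifier. A hedged sketch of the kind of change that should get past this particular diagnostic (a guess at a fix, not the upstream cltorch/clnn patch):

```c
/* Same Pair / binary_op shape as the dumped kernel source; the only change is
 * `inline` instead of `static`, which OpenCL C 1.1 accepts. */
typedef struct Pair {
    float first;
    float second;
} Pair;

inline Pair binary_op(Pair a, Pair b)
{
    /* keep the pair with the larger value (max-with-index reduction) */
    return (a.first > b.first) ? a : b;
}
```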
____*________*|_____________________________________________________ ==> LogSoftMax_backward_batchTHClTensorMathTransformReduce.cl build log:
input.cl:8:1: error: OpenCL does not support the 'static' storage class specifier
kernel build error:
kernel source:
1: // from lib/THC/THCTensorMathTransformReduce.cu:
2:
3: typedef struct Pair {
4: float first;
5: float second;
6: } Pair;
7:
8: static Pair binary_op( Pair a, Pair b ) {
9: if( a.first > b.first ) { return a; } else { return b; };
10: }
11:
12: /* A set of reduction kernels that take in binary ops on thrust pairs (of value, index).
13: These are useful when you not only have to do a reduction, but you might have
14: to preserve the location of contention (for example min/max operations).
15: The structure of the kernels follows the structure of the reduction kernels.
16: */
17: kernel void THClTensor_kernel_transformReduceOuterDimIndex(
18: global float *tgt1_data, int tgt1_offset,
19: global float *tgt2_data, int tgt2_offset,
20: global float *src__data, int src__offset,
21: int num_orows, int num_irows, int row_size
22: ) {
23: global float *tgt1 = tgt1_data + tgt1_offset;
24: global float *tgt2 = tgt2_data + tgt2_offset;
25: global float *src_ = src__data + src__offset;
26:
27: for (int orow = get_group_id(0); orow < num_orows; orow += get_num_groups(0)) {
28: for (int irow = get_group_id(1) * get_local_size(0) + get_local_id(0); irow < num_irows; irow += get_num_groups(1) * get_local_size(0)) {
29: global float *src = src_ + orow * row_size * num_irows + irow;
30: Pair acc = {.first=-3.40282e+38f, .second=-1};
31: for (int col = 0; col < row_size; ++col) {
32: Pair lhs = {*src, col+1};
33: acc = binary_op( lhs, acc);
34: // acc = binary_op(thrust::make_pair(*src, col+1), acc); // i+1 for 1-indexing
35: src += num_irows;
36: }
37: tgt1[orow * num_irows + irow] = acc.first;
38: tgt2[orow * num_irows + irow] = acc.second;
39: }
40: }
41: }
42:
43: /* Reduce the innermost dimension of a tensor (on thrust::pair functors which are (value, index))
44: *
45: * For an n-d tensor (n <= 4) where the reduction is along the innermost dimension:
46: *
47: * - block.x is the innermost dimension, i.e. dimension 0;
48: * - block.y and grid.y make up dimension 1; and
49: * - grid.x and grid z are the remaining two outer dimensions (if any)
50: *
51: * Reduction along other dimensions is handled in a separate kernel.
52: */
53: kernel void THClTensor_kernel_transformReduceInnermostDimIndex(
54: global float *tgt1_data, int tgt1_offset,
55: global float *tgt2_data, int tgt2_offset,
56: global float *src__data, int src__offset,
57: int num_rows, int row_size
58: ) {
59: global float *tgt1 = tgt1_data + tgt1_offset;
60: global float *tgt2 = tgt2_data + tgt2_offset;
61: global float *src_ = src__data + src__offset;
62:
63: local float sbuf[16][16];
64: local float ibuf[16][16];
65:
66: for (int block_row = get_group_id(0) * get_local_size(1); block_row < num_rows; block_row += get_local_size(1) * get_num_groups(0)) {
67: int row = block_row + get_local_id(1);
68: // thrust::pair<float,float> acc = init;
69: Pair acc = { .first=-3.40282e+38f, .second=-1 };
70: if (row < num_rows) {
71: global float *src = src_ + row * row_size;
72: // Sequential reduction within a thread.
73: for (int col = get_local_id(0); col < row_size; col += get_local_size(0)) {
74: Pair lhs = {src[col], col+1};
75: acc = binary_op(lhs, acc);
76: }
77: }
78:
79: sbuf[get_local_id(1)][get_local_id(0)] = acc.first;
80: ibuf[get_local_id(1)][get_local_id(0)] = acc.second;
81:
82: // Reduce intermediate values to single value.
83: local float* sline = &sbuf[get_local_id(1)][0];
84: local float* iline = &ibuf[get_local_id(1)][0];
85: for (int s = 8; s > 0; s >>= 1) {
86: if (row < num_rows && (int)get_local_id(0) < s) {
87: Pair arg1 = {.first=sline[get_local_id(0)], .second=iline[get_local_id(0)]};
88: Pair arg2 = {.first=sline[get_local_id(0) + s], .second=iline[get_local_id(0) + s]};
89: Pair res = binary_op(arg1, arg2);
90: sline[get_local_id(0)] = res.first;
91: iline[get_local_id(0)] = res.second;
92: }
93: barrier(CLK_LOCAL_MEM_FENCE);
94: }
95:
96: if (row < num_rows && get_local_id(0) == 0) {
97: tgt1[row] = sline[0];
98: tgt2[row] = iline[0];
99: }
100: barrier(CLK_LOCAL_MEM_FENCE);
101: }
102: }
103:
104:
Something went wrong with clCreateKernel, OpenCL error code -45
THClTensorMathTransformReduce.cl build log:
input.cl:8:1: error: OpenCL does not support the 'static' storage class specifier
____*________**|____________________________________________________ ==> LogSoftMax_forwardTHClTensorMathTransformReduce.cl build log:
input.cl:8:1: error: OpenCL does not support the 'static' storage class specifier
kernel build error:
kernel source:
1: // from lib/THC/THCTensorMathTransformReduce.cu:
2:
3: typedef struct Pair {
4: float first;
5: float second;
6: } Pair;
7:
8: static Pair binary_op( Pair a, Pair b ) {
9: if( a.first > b.first ) { return a; } else { return b; };
10: }
11:
12: /* A set of reduction kernels that take in binary ops on thrust pairs (of value, index).
13: These are useful when you not only have to do a reduction, but you might have
14: to preserve the location of contention (for example min/max operations).
15: The structure of the kernels follows the structure of the reduction kernels.
16: */
17: kernel void THClTensor_kernel_transformReduceOuterDimIndex(
18: global float *tgt1_data, int tgt1_offset,
19: global float *tgt2_data, int tgt2_offset,
20: global float *src__data, int src__offset,
21: int num_orows, int num_irows, int row_size
22: ) {
23: global float *tgt1 = tgt1_data + tgt1_offset;
24: global float *tgt2 = tgt2_data + tgt2_offset;
25: global float *src_ = src__data + src__offset;
26:
27: for (int orow = get_group_id(0); orow < num_orows; orow += get_num_groups(0)) {
28: for (int irow = get_group_id(1) * get_local_size(0) + get_local_id(0); irow < num_irows; irow += get_num_groups(1) * get_local_size(0)) {
29: global float *src = src_ + orow * row_size * num_irows + irow;
30: Pair acc = {.first=-3.40282e+38f, .second=-1};
31: for (int col = 0; col < row_size; ++col) {
32: Pair lhs = {*src, col+1};
33: acc = binary_op( lhs, acc);
34: // acc = binary_op(thrust::make_pair(*src, col+1), acc); // i+1 for 1-indexing
35: src += num_irows;
36: }
37: tgt1[orow * num_irows + irow] = acc.first;
38: tgt2[orow * num_irows + irow] = acc.second;
39: }
40: }
41: }
42:
43: /* Reduce the innermost dimension of a tensor (on thrust::pair functors which are (value, index))
44: *
45: * For an n-d tensor (n <= 4) where the reduction is along the innermost dimension:
46: *
47: * - block.x is the innermost dimension, i.e. dimension 0;
48: * - block.y and grid.y make up dimension 1; and
49: * - grid.x and grid z are the remaining two outer dimensions (if any)
50: *
51: * Reduction along other dimensions is handled in a separate kernel.
52: */
53: kernel void THClTensor_kernel_transformReduceInnermostDimIndex(
54: global float *tgt1_data, int tgt1_offset,
55: global float *tgt2_data, int tgt2_offset,
56: global float *src__data, int src__offset,
57: int num_rows, int row_size
58: ) {
59: global float *tgt1 = tgt1_data + tgt1_offset;
60: global float *tgt2 = tgt2_data + tgt2_offset;
61: global float *src_ = src__data + src__offset;
62:
63: local float sbuf[16][16];
64: local float ibuf[16][16];
65:
66: for (int block_row = get_group_id(0) * get_local_size(1); block_row < num_rows; block_row += get_local_size(1) * get_num_groups(0)) {
67: int row = block_row + get_local_id(1);
68: // thrust::pair<float,float> acc = init;
69: Pair acc = { .first=-3.40282e+38f, .second=-1 };
70: if (row < num_rows) {
71: global float *src = src_ + row * row_size;
72: // Sequential reduction within a thread.
73: for (int col = get_local_id(0); col < row_size; col += get_local_size(0)) {
74: Pair lhs = {src[col], col+1};
75: acc = binary_op(lhs, acc);
76: }
77: }
78:
79: sbuf[get_local_id(1)][get_local_id(0)] = acc.first;
80: ibuf[get_local_id(1)][get_local_id(0)] = acc.second;
81:
82: // Reduce intermediate values to single value.
83: local float* sline = &sbuf[get_local_id(1)][0];
84: local float* iline = &ibuf[get_local_id(1)][0];
85: for (int s = 8; s > 0; s >>= 1) {
86: if (row < num_rows && (int)get_local_id(0) < s) {
87: Pair arg1 = {.first=sline[get_local_id(0)], .second=iline[get_local_id(0)]};
88: Pair arg2 = {.first=sline[get_local_id(0) + s], .second=iline[get_local_id(0) + s]};
89: Pair res = binary_op(arg1, arg2);
90: sline[get_local_id(0)] = res.first;
91: iline[get_local_id(0)] = res.second;
92: }
93: barrier(CLK_LOCAL_MEM_FENCE);
94: }
95:
96: if (row < num_rows && get_local_id(0) == 0) {
97: tgt1[row] = sline[0];
98: tgt2[row] = iline[0];
99: }
100: barrier(CLK_LOCAL_MEM_FENCE);
101: }
102: }
103:
104:
Something went wrong with clCreateKernel, OpenCL error code -45
THClTensorMathTransformReduce.cl build log:
input.cl:8:1: error: OpenCL does not support the 'static' storage class specifier
____*________***|___________________________________________________ ==> LogSoftMax_forward_batchTHClTensorMathTransformReduce.cl build log:
input.cl:8:1: error: OpenCL does not support the 'static' storage class specifier
kernel build error:
kernel source:
1: // from lib/THC/THCTensorMathTransformReduce.cu:
2:
3: typedef struct Pair {
4: float first;
5: float second;
6: } Pair;
7:
8: static Pair binary_op( Pair a, Pair b ) {
9: if( a.first > b.first ) { return a; } else { return b; };
10: }
11:
12: /* A set of reduction kernels that take in binary ops on thrust pairs (of value, index).
13: These are useful when you not only have to do a reduction, but you might have
14: to preserve the location of contention (for example min/max operations).
15: The structure of the kernels follows the structure of the reduction kernels.
16: */
17: kernel void THClTensor_kernel_transformReduceOuterDimIndex(
18: global float *tgt1_data, int tgt1_offset,
19: global float *tgt2_data, int tgt2_offset,
20: global float *src__data, int src__offset,
21: int num_orows, int num_irows, int row_size
22: ) {
23: global float *tgt1 = tgt1_data + tgt1_offset;
24: global float *tgt2 = tgt2_data + tgt2_offset;
25: global float *src_ = src__data + src__offset;
26:
27: for (int orow = get_group_id(0); orow < num_orows; orow += get_num_groups(0)) {
28: for (int irow = get_group_id(1) * get_local_size(0) + get_local_id(0); irow < num_irows; irow += get_num_groups(1) * get_local_size(0)) {
29: global float *src = src_ + orow * row_size * num_irows + irow;
30: Pair acc = {.first=-3.40282e+38f, .second=-1};
31: for (int col = 0; col < row_size; ++col) {
32: Pair lhs = {*src, col+1};
33: acc = binary_op( lhs, acc);
34: // acc = binary_op(thrust::make_pair(*src, col+1), acc); // i+1 for 1-indexing
35: src += num_irows;
36: }
37: tgt1[orow * num_irows + irow] = acc.first;
38: tgt2[orow * num_irows + irow] = acc.second;
39: }
40: }
41: }
42:
43: /* Reduce the innermost dimension of a tensor (on thrust::pair functors which are (value, index))
44: *
45: * For an n-d tensor (n <= 4) where the reduction is along the innermost dimension:
46: *
47: * - block.x is the innermost dimension, i.e. dimension 0;
48: * - block.y and grid.y make up dimension 1; and
49: * - grid.x and grid z are the remaining two outer dimensions (if any)
50: *
51: * Reduction along other dimensions is handled in a separate kernel.
52: */
53: kernel void THClTensor_kernel_transformReduceInnermostDimIndex(
54: global float *tgt1_data, int tgt1_offset,
55: global float *tgt2_data, int tgt2_offset,
56: global float *src__data, int src__offset,
57: int num_rows, int row_size
58: ) {
59: global float *tgt1 = tgt1_data + tgt1_offset;
60: global float *tgt2 = tgt2_data + tgt2_offset;
61: global float *src_ = src__data + src__offset;
62:
63: local float sbuf[16][16];
64: local float ibuf[16][16];
65:
66: for (int block_row = get_group_id(0) * get_local_size(1); block_row < num_rows; block_row += get_local_size(1) * get_num_groups(0)) {
67: int row = block_row + get_local_id(1);
68: // thrust::pair<float,float> acc = init;
69: Pair acc = { .first=-3.40282e+38f, .second=-1 };
70: if (row < num_rows) {
71: global float *src = src_ + row * row_size;
72: // Sequential reduction within a thread.
73: for (int col = get_local_id(0); col < row_size; col += get_local_size(0)) {
74: Pair lhs = {src[col], col+1};
75: acc = binary_op(lhs, acc);
76: }
77: }
78:
79: sbuf[get_local_id(1)][get_local_id(0)] = acc.first;
80: ibuf[get_local_id(1)][get_local_id(0)] = acc.second;
81:
82: // Reduce intermediate values to single value.
83: local float* sline = &sbuf[get_local_id(1)][0];
84: local float* iline = &ibuf[get_local_id(1)][0];
85: for (int s = 8; s > 0; s >>= 1) {
86: if (row < num_rows && (int)get_local_id(0) < s) {
87: Pair arg1 = {.first=sline[get_local_id(0)], .second=iline[get_local_id(0)]};
88: Pair arg2 = {.first=sline[get_local_id(0) + s], .second=iline[get_local_id(0) + s]};
89: Pair res = binary_op(arg1, arg2);
90: sline[get_local_id(0)] = res.first;
91: iline[get_local_id(0)] = res.second;
92: }
93: barrier(CLK_LOCAL_MEM_FENCE);
94: }
95:
96: if (row < num_rows && get_local_id(0) == 0) {
97: tgt1[row] = sline[0];
98: tgt2[row] = iline[0];
99: }
100: barrier(CLK_LOCAL_MEM_FENCE);
101: }
102: }
103:
104:
Something went wrong with clCreateKernel, OpenCL error code -45
THClTensorMathTransformReduce.cl build log:
input.cl:8:1: error: OpenCL does not support the 'static' storage class specifier
____*________****|__________________________________________________ ==> LookupTable_backwardnDim 97 nInput 10 batch false error 0
nDim 97 nInput 10 batch true error 0
nDim 97 nInput 101 batch false error 0
nDim 97 nInput 101 batch true error 0
nDim 255 nInput 10 batch false error 0
nDim 255 nInput 10 batch true error 0
nDim 255 nInput 101 batch false error 0
nDim 255 nInput 101 batch true error 0
____*________****______________________________________|____________ ==> Sum_backwardTHClReduce.cl build log:
unsupported call to function reduceBlock in THClTensor_reduceContigDim
kernel build error:
kernel source:
1: // Threads per thread block
2: #define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16
3:
4: inline float modifyOp(float _in1) {
5: float _out;
6: float *in1 = &_in1;
7: float *out = &_out;
8: *out = *in1;
9: return _out;
10: }
11:
12: inline float reduceOp(float _in1, float _in2) {
13: // I guess the compiler can sort this stuff out :-P
14: float _out;
15: float *in1 = &_in1;
16: float *in2 = &_in2;
17: float *out = &_out;
18: *out = *in1 + *in2;
19: return _out;
20: }
21:
22: // kernel argument that defines tensor layout
23: typedef struct TensorInfoCl {
24: // Extracts size/stride information for the kernel.
25: // Successive dimensions can be collapsed if the size/strides match
26: // up and thus there are no holes between the dimensions. This is used
27: // to reduce the complexity of the problem.
28: // The optional `reduceDim` indicates a reduction dimension for the
29: // given tensor, so that the output size for this dimension will be 1.
30:
31: int sizes[25];
32: int strides[25];
33: int offset;
34: int dims;
35: } TensorInfoCl;
36: // Contiguous tensors of more than one dimension are collapsed down
37: // to one tensor
38:
39:
40: // Translate a linear index for the apply to a float* offset;
41: // specialized on `Dims` to reduce nvcc compilation time
42:
43: inline int IndexToOffset_1001_get( int linearId, global TensorInfoCl *info) {
44: int offset = info->offset;
45:
46: // Use static dims
47: // for (int i = 1 - 1; i >= 0; --i) {
48: int curDimIndex;
49: int curDimOffset;
50: // bake this in....
51: curDimIndex = linearId % info->sizes[0];
52: curDimOffset = curDimIndex * info->strides[0];
53: offset += curDimOffset;
54:
55:
56:
57: // }
58:
59: return offset;
60: }
61:
62:
63: inline int IndexToOffset_998_get(int linearId, global const TensorInfoCl *info) {
64: return linearId + info->offset;
65: }
66:
67: inline int IndexToOffset_999_get(int linearId, global const TensorInfoCl *info) {
68: int offset = info->offset;
69:
70: // Use dynamic dims
71: for (int i = info->dims - 1; i >= 0; --i) {
72: int curDimIndex = linearId % info->sizes[i];
73: int curDimOffset = curDimIndex * info->strides[i];
74: offset += curDimOffset;
75:
76: linearId /= info->sizes[i];
77: }
78:
79: return offset;
80: }
81:
82: inline int getLinearBlockId() {
83: return get_group_id(2) * get_num_groups(1) * get_num_groups(0) +
84: get_group_id(1) * get_num_groups(0) +
85: get_group_id(0);
86: }
87:
88: // Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will
89: // return the reduced value
90:
91: inline float reduceBlock( local float* smem,
92: int numVals,
93: float threadVal,
94: float init) {
95: if (numVals == 0) {
96: return init;
97: }
98:
99: if ((int)get_local_id(0) < numVals) {
100: smem[ get_local_id(0)] = threadVal;
101: }
102:
103: // First warp will perform reductions across warps
104: barrier(CLK_LOCAL_MEM_FENCE);
105: if ((get_local_id(0) / 32) == 0) {
106: float r = (int)get_local_id(0) < numVals ? smem[get_local_id(0)] : init;
107:
108: for (int i = 32 + get_local_id(0); i < numVals; i += 32) {
109: r = reduceOp(r, smem[i]);
110: }
111:
112: smem[get_local_id(0)] = r;
113: }
114:
115: // First thread will perform reductions across the block
116: barrier(CLK_LOCAL_MEM_FENCE);
117:
118: float r = init;
119: if (get_local_id(0) == 0) {
120: r = smem[0];
121:
122: int numLanesParticipating = min(numVals, 32);
123:
124: if (numLanesParticipating == 32) {
125: // Unroll for 32 == 32 and numVals >= 32
126: // #pragma unroll
127: // unrolling by hand, so compiler-independent
128:
129: r = reduceOp(r, smem[1]);
130:
131: r = reduceOp(r, smem[2]);
132:
133: r = reduceOp(r, smem[3]);
134:
135: r = reduceOp(r, smem[4]);
136:
137: r = reduceOp(r, smem[5]);
138:
139: r = reduceOp(r, smem[6]);
140:
141: r = reduceOp(r, smem[7]);
142:
143: r = reduceOp(r, smem[8]);
144:
145: r = reduceOp(r, smem[9]);
146:
147: r = reduceOp(r, smem[10]);
148:
149: r = reduceOp(r, smem[11]);
150:
151: r = reduceOp(r, smem[12]);
152:
153: r = reduceOp(r, smem[13]);
154:
155: r = reduceOp(r, smem[14]);
156:
157: r = reduceOp(r, smem[15]);
158:
159: r = reduceOp(r, smem[16]);
160:
161: r = reduceOp(r, smem[17]);
162:
163: r = reduceOp(r, smem[18]);
164:
165: r = reduceOp(r, smem[19]);
166:
167: r = reduceOp(r, smem[20]);
168:
169: r = reduceOp(r, smem[21]);
170:
171: r = reduceOp(r, smem[22]);
172:
173: r = reduceOp(r, smem[23]);
174:
175: r = reduceOp(r, smem[24]);
176:
177: r = reduceOp(r, smem[25]);
178:
179: r = reduceOp(r, smem[26]);
180:
181: r = reduceOp(r, smem[27]);
182:
183: r = reduceOp(r, smem[28]);
184:
185: r = reduceOp(r, smem[29]);
186:
187: r = reduceOp(r, smem[30]);
188:
189: r = reduceOp(r, smem[31]);
190:
191: } else {
192: for (int i = 1; i < numLanesParticipating; ++i) {
193: r = reduceOp(r, smem[i]);
194: }
195: }
196: }
197:
198: return r;
199: }
200:
201:
202:
203:
204: inline int getReduceNoncontigDimSliceIndex() {
205: // Each thread handles one slice
206: return getLinearBlockId() * THCL_NONCONTIG_REDUCE_BLOCK_SIZE + /*threadIdx.x*/ get_local_id(0);
207: }
208:
209: // Kernel that handles an entire reduction of a slice of a tensor per each thread
210: kernel void
211: THClTensor_reduceNoncontigDim(global TensorInfoCl *out_info,
212: global float *out_data,
213: global TensorInfoCl *in_info,
214: global float *in_data,
215: int reductionStride,
216: int reductionSize,
217: int totalSlices,
218: float init) {
219: const int sliceIndex = getReduceNoncontigDimSliceIndex();
220:
221: if ((int)sliceIndex >= totalSlices) {
222: return;
223: }
224:
225: // Each thread picks a point in `out` and `in` for which it is
226: // producing the reduction
227: const int outOffset =
228: IndexToOffset_998_get(sliceIndex, &out_info[0]);
229: const int inBaseOffset =
230: IndexToOffset_1001_get(sliceIndex, &in_info[0]);
231:
232: // For each point in reductionSize, reduce into `r`
233: int inOffset = inBaseOffset;
234: float r = init;
235:
236: for (int i = 0; (int)i < reductionSize; ++i) {
237: r = reduceOp(r, modifyOp(in_data[inOffset]));
238: inOffset += reductionStride;
239: }
240:
241: // Write out reduced value
242: out_data[outOffset] = r;
243: }
244:
245: inline int getReduceContigDimSliceIndex() {
246: // Each block handles one slice
247: return getLinearBlockId();
248: }
249:
250: // Kernel that handles an entire reduction of a slice of a tensor per
251: // each block
252: kernel void
253: THClTensor_reduceContigDim(global TensorInfoCl *out_info,
254: global float *out_data,
255: global TensorInfoCl *in_info,
256: global float *in_data,
257: int reductionSize,
258: int totalSlices,
259: float init,
260: local float *smem) {
261: const int sliceIndex = getReduceContigDimSliceIndex();
262:
263: if ((int)sliceIndex >= totalSlices) {
264: return;
265: }
266:
267: // Get the offset in `out` for the reduction
268: const int outOffset =
269: IndexToOffset_998_get(sliceIndex, &out_info[0]);
270:
271: // Get the base offset in `in` for this block's reduction
272: const int inBaseOffset =
273: IndexToOffset_1001_get(sliceIndex, &in_info[0]);
274:
275: // Each thread in the block will reduce some subset of elements in
276: // the slice. The elements are guaranteed contiguous starting at
277: // `inBaseOffset`.
278: float r = init;
279: for (int i = /*threadIdx.x*/ get_local_id(0); (int)i < reductionSize; i += /*blockDim.x*/ get_local_size(0)) {
280: r = reduceOp(r, modifyOp(in_data[inBaseOffset + i]));
281: }
282:
283: // Reduce within the block
284: // extern __shared__ float smem[];
285: r = reduceBlock(smem, /*blockDim.x*/ get_local_size(0), r, init);
286:
287: if (/*threadIdx.x*/ get_local_id(0) == 0) {
288: // Write out reduced value
289: out_data[outOffset] = r;
290: }
291: }
292:
293:
Something went wrong with clCreateKernel, OpenCL error code -45
THClReduce.cl build log:
unsupported call to function reduceBlock in THClTensor_reduceContigDim
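
Error -45 from clCreateKernel is CL_INVALID_PROGRAM_EXECUTABLE: the program never built successfully for this device, which matches the build errors above. When reproducing this outside of cltorch, the full compiler log can be pulled with clGetProgramBuildInfo; a minimal host-side sketch, where `program` and `device` stand for whatever handles the caller already holds:

```c
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

/* Dump the OpenCL compiler log after a failed clBuildProgram. */
static void print_build_log(cl_program program, cl_device_id device)
{
    size_t log_size = 0;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                          0, NULL, &log_size);

    char *log = malloc(log_size + 1);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                          log_size, log, NULL);
    log[log_size] = '\0';

    fprintf(stderr, "build log:\n%s\n", log);
    free(log);
}
```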
____*________****______________________________________*|___________ ==> Sum_forwardTHClReduce.cl build log:
unsupported call to function reduceBlock in THClTensor_reduceContigDim
kernel build error:
kernel source:
1: // Threads per thread block
2: #define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16
3:
4: inline float modifyOp(float _in1) {
5: float _out;
6: float *in1 = &_in1;
7: float *out = &_out;
8: *out = *in1;
9: return _out;
10: }
11:
12: inline float reduceOp(float _in1, float _in2) {
13: // I guess the compiler can sort this stuff out :-P
14: float _out;
15: float *in1 = &_in1;
16: float *in2 = &_in2;
17: float *out = &_out;
18: *out = *in1 + *in2;
19: return _out;
20: }
21:
22: // kernel argument that defines tensor layout
23: typedef struct TensorInfoCl {
24: // Extracts size/stride information for the kernel.
25: // Successive dimensions can be collapsed if the size/strides match
26: // up and thus there are no holes between the dimensions. This is used
27: // to reduce the complexity of the problem.
28: // The optional `reduceDim` indicates a reduction dimension for the
29: // given tensor, so that the output size for this dimension will be 1.
30:
31: int sizes[25];
32: int strides[25];
33: int offset;
34: int dims;
35: } TensorInfoCl;
36: // Contiguous tensors of more than one dimension are collapsed down
37: // to one tensor
38:
39:
40: // Translate a linear index for the apply to a float* offset;
41: // specialized on `Dims` to reduce nvcc compilation time
42:
43: inline int IndexToOffset_1001_get( int linearId, global TensorInfoCl *info) {
44: int offset = info->offset;
45:
46: // Use static dims
47: // for (int i = 1 - 1; i >= 0; --i) {
48: int curDimIndex;
49: int curDimOffset;
50: // bake this in....
51: curDimIndex = linearId % info->sizes[0];
52: curDimOffset = curDimIndex * info->strides[0];
53: offset += curDimOffset;
54:
55:
56:
57: // }
58:
59: return offset;
60: }
61:
62:
63: inline int IndexToOffset_998_get(int linearId, global const TensorInfoCl *info) {
64: return linearId + info->offset;
65: }
66:
67: inline int IndexToOffset_999_get(int linearId, global const TensorInfoCl *info) {
68: int offset = info->offset;
69:
70: // Use dynamic dims
71: for (int i = info->dims - 1; i >= 0; --i) {
72: int curDimIndex = linearId % info->sizes[i];
73: int curDimOffset = curDimIndex * info->strides[i];
74: offset += curDimOffset;
75:
76: linearId /= info->sizes[i];
77: }
78:
79: return offset;
80: }
81:
82: inline int getLinearBlockId() {
83: return get_group_id(2) * get_num_groups(1) * get_num_groups(0) +
84: get_group_id(1) * get_num_groups(0) +
85: get_group_id(0);
86: }
87:
88: // Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will
89: // return the reduced value
90:
91: inline float reduceBlock( local float* smem,
92: int numVals,
93: float threadVal,
94: float init) {
95: if (numVals == 0) {
96: return init;
97: }
98:
99: if ((int)get_local_id(0) < numVals) {
100: smem[ get_local_id(0)] = threadVal;
101: }
102:
103: // First warp will perform reductions across warps
104: barrier(CLK_LOCAL_MEM_FENCE);
105: if ((get_local_id(0) / 32) == 0) {
106: float r = (int)get_local_id(0) < numVals ? smem[get_local_id(0)] : init;
107:
108: for (int i = 32 + get_local_id(0); i < numVals; i += 32) {
109: r = reduceOp(r, smem[i]);
110: }
111:
112: smem[get_local_id(0)] = r;
113: }
114:
115: // First thread will perform reductions across the block
116: barrier(CLK_LOCAL_MEM_FENCE);
117:
118: float r = init;
119: if (get_local_id(0) == 0) {
120: r = smem[0];
121:
122: int numLanesParticipating = min(numVals, 32);
123:
124: if (numLanesParticipating == 32) {
125: // Unroll for 32 == 32 and numVals >= 32
126: // #pragma unroll
127: // unrolling by hand, so compiler-independent
128:
129: r = reduceOp(r, smem[1]);
130:
131: r = reduceOp(r, smem[2]);
132:
133: r = reduceOp(r, smem[3]);
134:
135: r = reduceOp(r, smem[4]);
136:
137: r = reduceOp(r, smem[5]);
138:
139: r = reduceOp(r, smem[6]);
140:
141: r = reduceOp(r, smem[7]);
142:
143: r = reduceOp(r, smem[8]);
144:
145: r = reduceOp(r, smem[9]);
146:
147: r = reduceOp(r, smem[10]);
148:
149: r = reduceOp(r, smem[11]);
150:
151: r = reduceOp(r, smem[12]);
152:
153: r = reduceOp(r, smem[13]);
154:
155: r = reduceOp(r, smem[14]);
156:
157: r = reduceOp(r, smem[15]);
158:
159: r = reduceOp(r, smem[16]);
160:
161: r = reduceOp(r, smem[17]);
162:
163: r = reduceOp(r, smem[18]);
164:
165: r = reduceOp(r, smem[19]);
166:
167: r = reduceOp(r, smem[20]);
168:
169: r = reduceOp(r, smem[21]);
170:
171: r = reduceOp(r, smem[22]);
172:
173: r = reduceOp(r, smem[23]);
174:
175: r = reduceOp(r, smem[24]);
176:
177: r = reduceOp(r, smem[25]);
178:
179: r = reduceOp(r, smem[26]);
180:
181: r = reduceOp(r, smem[27]);
182:
183: r = reduceOp(r, smem[28]);
184:
185: r = reduceOp(r, smem[29]);
186:
187: r = reduceOp(r, smem[30]);
188:
189: r = reduceOp(r, smem[31]);
190:
191: } else {
192: for (int i = 1; i < numLanesParticipating; ++i) {
193: r = reduceOp(r, smem[i]);
194: }
195: }
196: }
197:
198: return r;
199: }
200:
201:
202:
203:
204: inline int getReduceNoncontigDimSliceIndex() {
205: // Each thread handles one slice
206: return getLinearBlockId() * THCL_NONCONTIG_REDUCE_BLOCK_SIZE + /*threadIdx.x*/ get_local_id(0);
207: }
208:
209: // Kernel that handles an entire reduction of a slice of a tensor per each thread
210: kernel void
211: THClTensor_reduceNoncontigDim(global TensorInfoCl *out_info,
212: global float *out_data,
213: global TensorInfoCl *in_info,
214: global float *in_data,
215: int reductionStride,
216: int reductionSize,
217: int totalSlices,
218: float init) {
219: const int sliceIndex = getReduceNoncontigDimSliceIndex();
220:
221: if ((int)sliceIndex >= totalSlices) {
222: return;
223: }
224:
225: // Each thread picks a point in `out` and `in` for which it is
226: // producing the reduction
227: const int outOffset =
228: IndexToOffset_998_get(sliceIndex, &out_info[0]);
229: const int inBaseOffset =
230: IndexToOffset_1001_get(sliceIndex, &in_info[0]);
231:
232: // For each point in reductionSize, reduce into `r`
233: int inOffset = inBaseOffset;
234: float r = init;
235:
236: for (int i = 0; (int)i < reductionSize; ++i) {
237: r = reduceOp(r, modifyOp(in_data[inOffset]));
238: inOffset += reductionStride;
239: }
240:
241: // Write out reduced value
242: out_data[outOffset] = r;
243: }
244:
245: inline int getReduceContigDimSliceIndex() {
246: // Each block handles one slice
247: return getLinearBlockId();
248: }
249:
250: // Kernel that handles an entire reduction of a slice of a tensor per
251: // each block
252: kernel void
253: THClTensor_reduceContigDim(global TensorInfoCl *out_info,
254: global float *out_data,
255: global TensorInfoCl *in_info,
256: global float *in_data,
257: int reductionSize,
258: int totalSlices,
259: float init,
260: local float *smem) {
261: const int sliceIndex = getReduceContigDimSliceIndex();
262:
263: if ((int)sliceIndex >= totalSlices) {
264: return;
265: }
266:
267: // Get the offset in `out` for the reduction
268: const int outOffset =
269: IndexToOffset_998_get(sliceIndex, &out_info[0]);
270:
271: // Get the base offset in `in` for this block's reduction
272: const int inBaseOffset =
273: IndexToOffset_1001_get(sliceIndex, &in_info[0]);
274:
275: // Each thread in the block will reduce some subset of elements in
276: // the slice. The elements are guaranteed contiguous starting at
277: // `inBaseOffset`.
278: float r = init;
279: for (int i = /*threadIdx.x*/ get_local_id(0); (int)i < reductionSize; i += /*blockDim.x*/ get_local_size(0)) {
280: r = reduceOp(r, modifyOp(in_data[inBaseOffset + i]));
281: }
282:
283: // Reduce within the block
284: // extern __shared__ float smem[];
285: r = reduceBlock(smem, /*blockDim.x*/ get_local_size(0), r, init);
286:
287: if (/*threadIdx.x*/ get_local_id(0) == 0) {
288: // Write out reduced value
289: out_data[outOffset] = r;
290: }
291: }
292:
293:
Something went wrong with clCreateKernel, OpenCL error code -45
THClReduce.cl build log:
unsupported call to function reduceBlock in THClTensor_reduceContigDim
____*________****______________________________________**|__________ ==> Tanh_backwardApply_2t_0s_0pt_-2_-2_*out = tanh( *in1 ) build log:
input.cl:35:12: warning: implicit declaration of function 'tanh' is invalid in C99
unsupported call to function tanh in THClTensor_pointwiseApplyD
kernel build error:
kernel source:
1: // OpenCL kernels....
2:
3: // expected templated values:
4: // dims (vector of unique dimension values)
5: // operation
6: // dim1
7: // dim2
8: // dim3
9: // ... dimD
10: // num_input_tensors
11: // include_scalar_input
12: //
13: // maybe should add:
14: // IndexType (hardcoded to int for now)
15: // MAX_CUTORCH_DIMS (hardcoded to 25 for now)
16:
17: // (Ported from cutorch's THCApply.cuh)
18:
19: // Maximum number of dimensions allowed for cutorch
20: // #define MAX_CUTORCH_DIMS 25
21:
22: // Enum that indicates whether tensor arguments are read/write or
23: // read-only
24: //enum TensorArgType { ReadWrite, ReadOnly };
25:
26:
27:
28: inline void op( global float *out
29:
30: , global float *in1
31:
32:
33:
34: ) {
35: *out = tanh( *in1 );
36: }
37:
38: kernel void
39: THClTensor_pointwiseApplyD(
40:
41: int offset_1,
42:
43:
44: global float*data_1,
45:
46: int offset_2,
47:
48:
49: global float*data_2,
50:
51:
52:
53: int totalElements) {
54: int linearIndex = get_global_id(0);
55: if(linearIndex < totalElements ) {
56:
57:
58:
59:
60: int derived_offset_1 = linearIndex + offset_1;
61:
62:
63:
64:
65: int derived_offset_2 = linearIndex + offset_2;
66:
67:
68:
69: op(
70:
71:
72: &(data_1[derived_offset_1])
73:
74: ,
75: &(data_2[derived_offset_2])
76:
77:
78:
79:
80:
81: );
82: }
83: }
84:
85:
Something went wrong with clCreateKernel, OpenCL error code -45
Apply_2t_0s_0pt_-2_-2_*out = tanh( *in1 ) build log:
input.cl:35:12: warning: implicit declaration of function 'tanh' is invalid in C99
unsupported call to function tanh in THClTensor_pointwiseApplyD
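
tanh is a core OpenCL C built-in, so the "implicit declaration of function 'tanh'" warning points at a gap in this particular Clover/libclc build rather than in the generated kernel. Assuming exp is available (the log never complains about it), a kernel-side fallback could look like the sketch below; my_tanh and tanh_apply are made-up names, not part of clnn:

```c
/* Hedged fallback: tanh(x) = (exp(2x) - 1) / (exp(2x) + 1). */
inline float my_tanh(float x)
{
    float e2x = exp(2.0f * x);
    return (e2x - 1.0f) / (e2x + 1.0f);
}

kernel void tanh_apply(global float *out, global const float *in, int totalElements)
{
    int i = get_global_id(0);
    if (i < totalElements) {
        out[i] = my_tanh(in[i]);
    }
}
```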
____*________****______________________________________***|_________ ==> Tanh_forwardApply_2t_0s_0pt_-2_-2_*out = tanh( *in1 ) build log:
input.cl:35:12: warning: implicit declaration of function 'tanh' is invalid in C99
unsupported call to function tanh in THClTensor_pointwiseApplyD
kernel build error:
kernel source:
1: // OpenCL kernels....
2:
3: // expected templated values:
4: // dims (vector of unique dimension values)
5: // operation
6: // dim1
7: // dim2
8: // dim3
9: // ... dimD
10: // num_input_tensors
11: // include_scalar_input
12: //
13: // maybe should add:
14: // IndexType (hardcoded to int for now)
15: // MAX_CUTORCH_DIMS (hardcoded to 25 for now)
16:
17: // (Ported from cutorch's THCApply.cuh)
18:
19: // Maximum number of dimensions allowed for cutorch
20: // #define MAX_CUTORCH_DIMS 25
21:
22: // Enum that indicates whether tensor arguments are read/write or
23: // read-only
24: //enum TensorArgType { ReadWrite, ReadOnly };
25:
26:
27:
28: inline void op( global float *out
29:
30: , global float *in1
31:
32:
33:
34: ) {
35: *out = tanh( *in1 );
36: }
37:
38: kernel void
39: THClTensor_pointwiseApplyD(
40:
41: int offset_1,
42:
43:
44: global float*data_1,
45:
46: int offset_2,
47:
48:
49: global float*data_2,
50:
51:
52:
53: int totalElements) {
54: int linearIndex = get_global_id(0);
55: if(linearIndex < totalElements ) {
56:
57:
58:
59:
60: int derived_offset_1 = linearIndex + offset_1;
61:
62:
63:
64:
65: int derived_offset_2 = linearIndex + offset_2;
66:
67:
68:
69: op(
70:
71:
72: &(data_1[derived_offset_1])
73:
74: ,
75: &(data_2[derived_offset_2])
76:
77:
78:
79:
80:
81: );
82: }
83: }
84:
85:
Something went wrong with clCreateKernel, OpenCL error code -45
Apply_2t_0s_0pt_-2_-2_*out = tanh( *in1 ) build log:
input.cl:35:12: warning: implicit declaration of function 'tanh' is invalid in C99
unsupported call to function tanh in THClTensor_pointwiseApplyD
____*________****______________________________________****|________ ==> Tanh_transposedApply_2t_0s_0pt_-2_2_*out = tanh( *in1 ) build log:
input.cl:35:12: warning: implicit declaration of function 'tanh' is invalid in C99
unsupported call to function tanh in THClTensor_pointwiseApplyD
kernel build error:
kernel source:
1: // OpenCL kernels....
2:
3: // expected templated values:
4: // dims (vector of unique dimension values)
5: // operation
6: // dim1
7: // dim2
8: // dim3
9: // ... dimD
10: // num_input_tensors
11: // include_scalar_input
12: //
13: // maybe should add:
14: // IndexType (hardcoded to int for now)
15: // MAX_CUTORCH_DIMS (hardcoded to 25 for now)
16:
17: // (Ported from cutorch's THCApply.cuh)
18:
19: // Maximum number of dimensions allowed for cutorch
20: // #define MAX_CUTORCH_DIMS 25
21:
22: // Enum that indicates whether tensor arguments are read/write or
23: // read-only
24: //enum TensorArgType { ReadWrite, ReadOnly };
25:
26:
27:
28: inline void op( global float *out
29:
30: , global float *in1
31:
32:
33:
34: ) {
35: *out = tanh( *in1 );
36: }
37:
38: kernel void
39: THClTensor_pointwiseApplyD(
40:
41: int offset_1,
42:
43:
44: global float*data_1,
45:
46: int offset_2,
47:
48:
49: int size_2_1,
50: int stride_2_1,
51:
52: int size_2_2,
53: int stride_2_2,
54:
55: global float*data_2,
56:
57:
58:
59: int totalElements) {
60: int linearIndex = get_global_id(0);
61: if(linearIndex < totalElements ) {
62:
63: int thisLinearId;
64:
65:
66:
67:
68: int derived_offset_1 = linearIndex + offset_1;
69:
70:
71:
72:
73: unsigned int derived_offset_2 = offset_2;
74: thisLinearId = linearIndex;
75: // bake this in....
76: derived_offset_2 += (thisLinearId % size_2_2) * stride_2_2;
77:
78: thisLinearId /= size_2_2;
79:
80: // bake this in....
81: derived_offset_2 += (thisLinearId % size_2_1) * stride_2_1;
82:
83: thisLinearId /= size_2_1;
84:
85:
86:
87:
88:
89:
90: op(
91:
92:
93: &(data_1[derived_offset_1])
94:
95: ,
96: &(data_2[derived_offset_2])
97:
98:
99:
100:
101:
102: );
103: }
104: }
105:
106:
Something went wrong with clCreateKernel, OpenCL error code -45
Apply_2t_0s_0pt_-2_2_*out = tanh( *in1 ) build log:
input.cl:35:12: warning: implicit declaration of function 'tanh' is invalid in C99
unsupported call to function tanh in THClTensor_pointwiseApplyD
____*________****______________________________________*****_____|__ ==> mseTHClReduceAll.cl build log:
unsupported call to function reduceBlock in THClTensor_reduceAll
kernel build error:
kernel source:
1: inline unsigned int THClCeilDiv(unsigned int a, unsigned int b) {
2: return (a + b - 1) / b;
3: }
4:
5:
6:
7: inline float modifyOp(float _in1) {
8: float _out;
9: float *in1 = &_in1;
10: float *out = &_out;
11: *out = *in1;
12: return _out;
13: }
14:
15: inline float reduceOp(float _in1, float _in2) {
16: // I guess the compiler can sort this stuff out :-P
17: float _out;
18: float *in1 = &_in1;
19: float *in2 = &_in2;
20: float *out = &_out;
21: *out = *in1 + *in2;
22: return _out;
23: }
24:
25: // kernel argument that defines tensor layout
26: typedef struct TensorInfoCl {
27: // Extracts size/stride information for the kernel.
28: // Successive dimensions can be collapsed if the size/strides match
29: // up and thus there are no holes between the dimensions. This is used
30: // to reduce the complexity of the problem.
31: // The optional `reduceDim` indicates a reduction dimension for the
32: // given tensor, so that the output size for this dimension will be 1.
33:
34: unsigned int sizes[25];
35: unsigned int strides[25];
36: unsigned int offset;
37: int dims;
38: } TensorInfoCl;
39: // Contiguous tensors of more than one dimension are collapsed down
40: // to one tensor
41:
42:
43: // Translate a linear index for the apply to a float* offset;
44: // specialized on `Dims` to reduce nvcc compilation time
45:
46:
47: inline unsigned int IndexToOffset_998_get(unsigned int linearId, global const TensorInfoCl *info) {
48: return linearId + info->offset;
49: }
50:
51: inline unsigned int IndexToOffset_999_get(unsigned int linearId, global const TensorInfoCl *info) {
52: unsigned int offset = info->offset;
53:
54: // Use dynamic dims
55: for (int i = info->dims - 1; i >= 0; --i) {
56: unsigned int curDimIndex = linearId % info->sizes[i];
57: unsigned int curDimOffset = curDimIndex * info->strides[i];
58: offset += curDimOffset;
59:
60: linearId /= info->sizes[i];
61: }
62:
63: return offset;
64: }
65:
66: inline unsigned int getLinearBlockId() {
67: return get_group_id(2) * get_num_groups(1) * get_num_groups(0) +
68: get_group_id(1) * get_num_groups(0) +
69: get_group_id(0);
70: }
71:
72: // Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will
73: // return the reduced value
74:
75: inline float reduceBlock( local float* smem,
76: int numVals,
77: float threadVal,
78: float init) {
79: if (numVals == 0) {
80: return init;
81: }
82:
83: if ((int)get_local_id(0) < numVals) {
84: smem[ get_local_id(0)] = threadVal;
85: }
86:
87: // First warp will perform reductions across warps
88: barrier(CLK_LOCAL_MEM_FENCE);
89: if ((get_local_id(0) / 32) == 0) {
90: float r = (int)get_local_id(0) < numVals ? smem[get_local_id(0)] : init;
91:
92: for (int i = 32 + get_local_id(0); i < numVals; i += 32) {
93: r = reduceOp(r, smem[i]);
94: }
95:
96: smem[get_local_id(0)] = r;
97: }
98:
99: // First thread will perform reductions across the block
100: barrier(CLK_LOCAL_MEM_FENCE);
101:
102: float r = init;
103: if (get_local_id(0) == 0) {
104: r = smem[0];
105:
106: int numLanesParticipating = min(numVals, 32);
107:
108: if (numLanesParticipating == 32) {
109: // Unroll for 32 == 32 and numVals >= 32
110: // #pragma unroll
111: // unrolling by hand, so compiler-independent
112:
113: r = reduceOp(r, smem[1]);
114:
115: r = reduceOp(r, smem[2]);
116:
117: r = reduceOp(r, smem[3]);
118:
119: r = reduceOp(r, smem[4]);
120:
121: r = reduceOp(r, smem[5]);
122:
123: r = reduceOp(r, smem[6]);
124:
125: r = reduceOp(r, smem[7]);
126:
127: r = reduceOp(r, smem[8]);
128:
129: r = reduceOp(r, smem[9]);
130:
131: r = reduceOp(r, smem[10]);
132:
133: r = reduceOp(r, smem[11]);
134:
135: r = reduceOp(r, smem[12]);
136:
137: r = reduceOp(r, smem[13]);
138:
139: r = reduceOp(r, smem[14]);
140:
141: r = reduceOp(r, smem[15]);
142:
143: r = reduceOp(r, smem[16]);
144:
145: r = reduceOp(r, smem[17]);
146:
147: r = reduceOp(r, smem[18]);
148:
149: r = reduceOp(r, smem[19]);
150:
151: r = reduceOp(r, smem[20]);
152:
153: r = reduceOp(r, smem[21]);
154:
155: r = reduceOp(r, smem[22]);
156:
157: r = reduceOp(r, smem[23]);
158:
159: r = reduceOp(r, smem[24]);
160:
161: r = reduceOp(r, smem[25]);
162:
163: r = reduceOp(r, smem[26]);
164:
165: r = reduceOp(r, smem[27]);
166:
167: r = reduceOp(r, smem[28]);
168:
169: r = reduceOp(r, smem[29]);
170:
171: r = reduceOp(r, smem[30]);
172:
173: r = reduceOp(r, smem[31]);
174:
175: } else {
176: for (int i = 1; i < numLanesParticipating; ++i) {
177: r = reduceOp(r, smem[i]);
178: }
179: }
180: }
181:
182: return r;
183: }
184:
185:
186:
187:
188: // Kernel that handles an entire reduction of a tensor in one pass
189: kernel void
190: THClTensor_reduceAll(global TensorInfoCl *in_info,
191: global float *in_data,
192: unsigned int totalElements,
193: float init,
194: global float* out,
195: local float *smem) {
196: // With a block-wide stride, have each thread perform its own reduction.
197: float r = init;
198: for (unsigned int i = get_local_id(0); i < totalElements; i += get_local_size(0)) {
199: const unsigned int inOffset = IndexToOffset_998_get(i, &in_info[0]);
200: r = reduceOp(r, modifyOp(in_data[inOffset]));
201: }
202:
203: // Reduce within the block
204: r = reduceBlock(smem, get_local_size(0), r, init);
205:
206: if(get_local_id(0) == 0) {
207: // Write out reduced value
208: out[0] = r;
209: }
210: }
211:
212: inline unsigned int getStartIndex(unsigned int totalSize) {
213: unsigned int sizePerBlock = THClCeilDiv(totalSize, (unsigned int) get_num_groups(0));
214: return get_group_id(0) * sizePerBlock;
215: }
216:
217: inline unsigned int getEndIndex(unsigned int totalSize) {
218: unsigned int sizePerBlock = THClCeilDiv(totalSize, (unsigned int) get_num_groups(0));
219: return min((unsigned int) ((get_group_id(0) + 1) * sizePerBlock), totalSize);
220: }
221:
222: // Kernel that handles an entire reduction of a tensor in two passes
223: kernel void
224: THClTensor_reduceAllPass1(global TensorInfoCl *in_info,
225: global float *in_data,
226: unsigned int totalElements,
227: float init,
228: global float* scratchSpace,
229: local float *smem) {
230: const unsigned int startIndex = getStartIndex(totalElements);
231: const unsigned int endIndex = getEndIndex(totalElements);
232:
233: // With a block-wide stride, have each thread perform its own reduction.
234: float r = init;
235: for (unsigned int i = startIndex + get_local_id(0); i < endIndex; i += get_local_size(0)) {
236: const unsigned int inOffset = IndexToOffset_998_get(i, &in_info[0]);
237: r = reduceOp(r, modifyOp(in_data[inOffset]));
238: }
239:
240: // Reduce within the block
241: r = reduceBlock(smem, get_local_size(0), r, init);
242:
243: if ((int)get_local_id(0) == 0) {
244: // Write out block-wide reduced value
245: scratchSpace[get_group_id(0)] = r;
246: }
247: }
248:
249: kernel void THClTensor_reduceAllPass2(int numPass1Blocks,
250: float init,
251: global float* scratchSpace,
252: global float* out,
253: local float *smem) {
254: float r = init;
255: if ((int)get_local_id(0) < numPass1Blocks) {
256: r = scratchSpace[get_local_id(0)];
257: }
258:
259: // Reduce within the block
260: r = reduceBlock(smem, numPass1Blocks, r, init);
261:
262: if((int)get_local_id(0) == 0) {
263: out[0] = r;
264: }
265: }
266:
267:
268:
269:
Something went wrong with clCreateKernel, OpenCL error code -45
THClReduceAll.cl build log:
unsupported call to function reduceBlock in THClTensor_reduceAll
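For reference, OpenCL error code -45 is CL_INVALID_PROGRAM_EXECUTABLE: clCreateKernel is being called on a program whose build for this device failed, which is consistent with the Clover build message above. The sketch below shows how such a failure can be surfaced with the plain OpenCL C API; it is not cltorch's actual error path, and the function and variable names are invented for the example.

    #include <CL/cl.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Build one kernel from source and print the device build log on failure.
     * Illustrative only: not cltorch's error handling. */
    static cl_kernel build_kernel_or_dump_log(cl_context ctx, cl_device_id dev,
                                              const char *source, const char *name) {
        cl_int err;
        cl_program prog = clCreateProgramWithSource(ctx, 1, &source, NULL, &err);
        if (err != CL_SUCCESS) return NULL;

        if (clBuildProgram(prog, 1, &dev, "", NULL, NULL) != CL_SUCCESS) {
            size_t len = 0;
            clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
            char *log = malloc(len + 1);
            clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, len, log, NULL);
            log[len] = '\0';
            /* e.g. "unsupported call to function reduceBlock in THClTensor_reduceAll" */
            fprintf(stderr, "kernel build error:\n%s\n", log);
            free(log);
            /* Falling through: clCreateKernel on a program with no valid
             * executable is what returns -45 (CL_INVALID_PROGRAM_EXECUTABLE). */
        }

        cl_kernel k = clCreateKernel(prog, name, &err);
        if (err != CL_SUCCESS) {
            fprintf(stderr, "clCreateKernel(%s): OpenCL error code %d\n", name, err);
            clReleaseProgram(prog);
            return NULL;
        }
        clReleaseProgram(prog); /* the kernel holds its own reference */
        return k;
    }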
____*________****______________________________________*****_____*|_ ==> mse_nosizeaverage
THClReduceAll.cl build log:
unsupported call to function reduceBlock in THClTensor_reduceAll
Something went wrong with clCreateKernel, OpenCL error code -45
THClReduceAll.cl build log:
unsupported call to function reduceBlock in THClTensor_reduceAll
____*________****______________________________________*****_____**| ==> mse_variablebatchsize
THClReduceAll.cl build log:
unsupported call to function reduceBlock in THClTensor_reduceAll
Something went wrong with clCreateKernel, OpenCL error code -45
THClReduceAll.cl build log:
unsupported call to function reduceBlock in THClTensor_reduceAll
____*________****______________________________________*****_____*** ==> Done
Completed 82 asserts in 68 tests with 13 errors
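For context on the last two kernels in the dump above: THClTensor_reduceAllPass1 has each work-group reduce its slice of the input down to one float in scratchSpace, and THClTensor_reduceAllPass2 then reduces those per-group partials with a single work-group. A rough host-side sketch of that launch sequence in the plain OpenCL C API follows; cltorch drives this through its own C++ wrappers, so the function name, group counts, and buffer handling here are illustrative assumptions only.

    #include <CL/cl.h>

    /* Illustrative two-pass reduction launch. `pass1` and `pass2` are kernels
     * built from THClReduceAll.cl; in_info_buf/in_data_buf are assumed to exist. */
    cl_int reduce_all_two_pass(cl_context ctx, cl_command_queue queue,
                               cl_kernel pass1, cl_kernel pass2,
                               cl_mem in_info_buf, cl_mem in_data_buf,
                               cl_uint totalElements, float init,
                               float *result) {
        const size_t local = 256;     /* work-group size */
        const size_t numGroups = 64;  /* must not exceed `local`, since pass 2
                                         reduces the partials in one work-group */
        cl_int err = CL_SUCCESS;

        cl_mem scratch = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
                                        numGroups * sizeof(float), NULL, &err);
        if (err != CL_SUCCESS) return err;
        cl_mem out = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, sizeof(float), NULL, &err);
        if (err != CL_SUCCESS) { clReleaseMemObject(scratch); return err; }

        /* Pass 1: each work-group writes one partial result to scratch[group_id]. */
        clSetKernelArg(pass1, 0, sizeof(cl_mem), &in_info_buf);
        clSetKernelArg(pass1, 1, sizeof(cl_mem), &in_data_buf);
        clSetKernelArg(pass1, 2, sizeof(cl_uint), &totalElements);
        clSetKernelArg(pass1, 3, sizeof(float), &init);
        clSetKernelArg(pass1, 4, sizeof(cl_mem), &scratch);
        clSetKernelArg(pass1, 5, local * sizeof(float), NULL); /* local float *smem */
        size_t global1 = local * numGroups;
        err = clEnqueueNDRangeKernel(queue, pass1, 1, NULL, &global1, &local, 0, NULL, NULL);

        /* Pass 2: a single work-group reduces the numGroups partials. */
        cl_int numPass1Blocks = (cl_int)numGroups;
        clSetKernelArg(pass2, 0, sizeof(cl_int), &numPass1Blocks);
        clSetKernelArg(pass2, 1, sizeof(float), &init);
        clSetKernelArg(pass2, 2, sizeof(cl_mem), &scratch);
        clSetKernelArg(pass2, 3, sizeof(cl_mem), &out);
        clSetKernelArg(pass2, 4, local * sizeof(float), NULL);
        size_t global2 = local;
        if (err == CL_SUCCESS)
            err = clEnqueueNDRangeKernel(queue, pass2, 1, NULL, &global2, &local, 0, NULL, NULL);

        if (err == CL_SUCCESS)
            err = clEnqueueReadBuffer(queue, out, CL_TRUE, 0, sizeof(float), result, 0, NULL, NULL);

        clReleaseMemObject(out);
        clReleaseMemObject(scratch);
        return err;
    }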
--------------------------------------------------------------------------------
ClassNLLCriterionMultipleTarget
Function call failed
C++ exception
--------------------------------------------------------------------------------
LogSoftMax_backward
Function call failed
C++ exception
--------------------------------------------------------------------------------
LogSoftMax_backward_batch
Function call failed
C++ exception
--------------------------------------------------------------------------------
LogSoftMax_forward
Function call failed
C++ exception
--------------------------------------------------------------------------------
LogSoftMax_forward_batch
Function call failed
C++ exception
--------------------------------------------------------------------------------
Sum_backward
Function call failed
/home/lcestari/torch/install/share/lua/5.1/nn/Sum.lua:27: Something went wrong:
kernel source:
1: // Threads per thread block
2: #define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16
3:
4: inline float modifyOp(float _in1) {
5: float _out;
6: float *in1 = &_in1;
7: float *out = &_out;
8: *out = *in1;
9: return _out;
10: }
11:
12: inline float reduceOp(float _in1, float _in2) {
13: // I guess the compiler can sort this stuff out :-P
14: float _out;
15: float *in1 = &_in1;
16: float *in2 = &_in2;
17: float *out = &_out;
18: *out = *in1 + *in2;
19: return _out;
20: }
21:
22: // kernel argument that defines tensor layout
23: typedef struct TensorInfoCl {
24: // Extracts size/stride information for the kernel.
25: // Successive dimensions can be collapsed if the size/strides match
26: // up and thus there are no holes between the dimensions. This is used
27: // to reduce the complexity of the problem.
28: // The optional `reduceDim` indicates a reduction dimension for the
29: // given tensor, so that the output size for this dimension will be 1.
30:
31: int sizes[25];
32: int strides[25];
33: int offset;
34: int dims;
35: } TensorInfoCl;
36: // Contiguous tensors of more than one dimension are collapsed down
37: // to one tensor
38:
39:
40: // Translate a linear index for the apply to a float* offset;
41: // specialized on `Dims` to reduce nvcc compilation time
42:
43: inline int IndexToOffset_1001_get( int linearId, global TensorInfoCl *info) {
44: int offset = info->offset;
45:
46: // Use static dims
47: // for (int i = 1 - 1; i >= 0; --i) {
48: int curDimIndex;
49: int curDimOffset;
50: // bake this in....
51: curDimIndex = linearId 64nfo->sizes[0];
52: curDimOffset = curDimIndex * info->strides[0];
53: offset += curDimOffset;
54:
55:
56:
57: // }
58:
59: return offset;
60: }
61:
62:
63: inline int IndexToOffset_998_get(int linearId, global const TensorInfoCl *info) {
64: return linearId + info->offset;
65: }
66:
67: inline int IndexToOffset_999_get(int linearId, global const Te
stack traceback:
[C]: in function 'sum'
/home/lcestari/torch/install/share/lua/5.1/nn/Sum.lua:27: in function 'forward'
/home/lcestari/torch/install/share/lua/5.1/clnn/test.lua:731: in function 'v'
/home/lcestari/torch/install/share/lua/5.1/clnn/test.lua:2557: in function </home/lcestari/torch/install/share/lua/5.1/clnn/test.lua:2555>
[C]: in function 'xpcall'
/home/lcestari/torch/install/share/lua/5.1/torch/Tester.lua:115: in function 'pcall'
/home/lcestari/torch/install/share/lua/5.1/torch/Tester.lua:186: in function '_run'
/home/lcestari/torch/install/share/lua/5.1/torch/Tester.lua:161: in function 'run'
/home/lcestari/torch/install/share/lua/5.1/clnn/test.lua:2596: in function 'test'
(command line):1: in main chunk
[C]: at 0x00405c20
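One thing worth noting in the dumped Sum kernel: line 51, `curDimIndex = linearId 64nfo->sizes[0];`, is not valid OpenCL C, so this generated source could not build in any case. Judging from the dynamic-dims helper in the THClReduceAll source further up (`curDimIndex = linearId % info->sizes[i];`), the intended expression is presumably `linearId % info->sizes[0]`. A small reconstruction of that single-dimension index-to-offset step is sketched below; this is an assumption about the intent, not the project's generated code, and the function name is invented here.

    /* Assumed reconstruction of the 1-D specialization of IndexToOffset. */
    typedef struct TensorInfoCl {
        int sizes[25];
        int strides[25];
        int offset;
        int dims;
    } TensorInfoCl;

    static inline int IndexToOffset_dim1(int linearId, const TensorInfoCl *info) {
        /* Presumed intent of the garbled line: wrap the linear index into
         * dimension 0, then scale by that dimension's stride. */
        int curDimIndex = linearId % info->sizes[0];
        return info->offset + curDimIndex * info->strides[0];
    }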
--------------------------------------------------------------------------------
Sum_forward
Function call failed
/home/lcestari/torch/install/share/lua/5.1/nn/Sum.lua:27: Something went wrong:
stack traceback:
[C]: in function 'sum'
/home/lcestari/torch/install/share/lua/5.1/nn/Sum.lua:27: in function 'forward'
/home/lcestari/torch/install/share/lua/5.1/clnn/test.lua:697: in function 'v'
/home/lcestari/torch/install/share/lua/5.1/clnn/test.lua:2557: in function </home/lcestari/torch/install/share/lua/5.1/clnn/test.lua:2555>
[C]: in function 'xpcall'
/home/lcestari/torch/install/share/lua/5.1/torch/Tester.lua:115: in function 'pcall'
/home/lcestari/torch/install/share/lua/5.1/torch/Tester.lua:186: in function '_run'
/home/lcestari/torch/install/share/lua/5.1/torch/Tester.lua:161: in function 'run'
/home/lcestari/torch/install/share/lua/5.1/clnn/test.lua:2596: in function 'test'
(command line):1: in main chunk
[C]: at 0x00405c20
--------------------------------------------------------------------------------
Tanh_backward
Function call failed
C++ exception
--------------------------------------------------------------------------------
Tanh_forward
Function call failed
C++ exception
--------------------------------------------------------------------------------
Tanh_transposed
Function call failed
C++ exception
--------------------------------------------------------------------------------
mse
Function call failed
C++ exception
--------------------------------------------------------------------------------
mse_nosizeaverage
Function call failed
C++ exception
--------------------------------------------------------------------------------
mse_variablebatchsize
Function call failed
C++ exception
--------------------------------------------------------------------------------
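Every THClReduceAll failure above carries the same Clover build message, "unsupported call to function reduceBlock in THClTensor_reduceAll", i.e. Mesa's compiler rejects the call from the kernel into the reduceBlock helper. For comparison only, the sketch below shows the same local-memory reduction pattern written directly in a kernel body; the kernel and argument names are invented here, and whether Clover 11.1 would accept this form is an assumption, not something tested in this run.

    // Standalone work-group sum reduction, illustrating the reduceBlock pattern.
    kernel void wg_sum(global const float *in,
                       unsigned int n,
                       global float *partials,
                       local float *smem) {
        const unsigned int lid = get_local_id(0);
        const unsigned int lsz = get_local_size(0);

        // Each work-item accumulates a grid-strided partial sum.
        float acc = 0.0f;
        for (unsigned int i = get_global_id(0); i < n; i += get_global_size(0)) {
            acc += in[i];
        }
        smem[lid] = acc;
        barrier(CLK_LOCAL_MEM_FENCE);

        // Tree reduction in local memory; assumes lsz is a power of two.
        for (unsigned int s = lsz / 2; s > 0; s >>= 1) {
            if (lid < s) {
                smem[lid] += smem[lid + s];
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }

        // One partial result per work-group, to be reduced again by a
        // second pass (as THClTensor_reduceAllPass2 does) or on the host.
        if (lid == 0) {
            partials[get_group_id(0)] = smem[0];
        }
    }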