@coodoo
Created March 27, 2016 01:18
-- simple case to reproduce a GPU crash bug
require 'paths'
require 'rnn'
require 'nngraph'
require 'cltorch'
require 'clnn'
cltorch.setDevice(1)
------------------------------------------------------------------------------------
-- model
------------------------------------------------------------------------------------
local lm = nn.Sequential()
-- word embedding layer: vocabulary of 10000, embedding size 200
local lookup = nn.LookupTable( 10000, 200 )
lookup.maxnormout = -1
lm:add( lookup )
lm:add( nn.SplitTable(1) ) -- split along dim 1 (the time steps) into a table of [batch x 200] tensors
-- rnn layer, applied to every time step by the Sequencer
local stepmodule = nn.Sequential()
stepmodule:add( nn.FastLSTM( 200, 200 ) )
stepmodule:add( nn.Linear( 200, 10000 ) )
stepmodule:add( nn.LogSoftMax() )
lm:add( nn.Sequencer(stepmodule) )
lm:remember( 'both' ) -- keep hidden state across forward calls, in both training and evaluation mode
------------------------------------------------------------------------------------
-- test case
------------------------------------------------------------------------------------
-- when set to false, the script runs fine on the CPU
-- when set to true, it crashes the GPU
local useGPU = true
-- 5 time steps x batch of 32; fill(1) overwrites the randn values with the valid vocabulary index 1
local inputs = torch.randn(5, 32):fill(1)
if useGPU == true then
  lm:cl()
  inputs = inputs:cl()
end
local outputs = lm:forward( inputs )
print(outputs)
-- switch to evaluate mode
lm:evaluate()
-- change the input size from [5x32] to [5x2]; this crashes the GPU, see the error message at the end
inputs = torch.randn(5, 2):fill(2)
print('\n\nnew dimension')
outputs = lm:forward( inputs )
print(outputs)
--[[
Invalid work group size, code -54
/Users/jlu/torch/install/bin/luajit: /Users/jlu/torch/install/share/lua/5.1/clnn/LogSoftMax.lua:41:
kernel source:
1: // Threads per thread block
2: #define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16
3:
]]--
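
Before the full console output below: the stack traceback points at clnn's LogSoftMax, so one way to narrow this down might be to feed LogSoftMax the same two batch sizes directly, without the LSTM or the Sequencer. This is an untested sketch, not part of the original repro, and it only helps if the crash really is independent of the remembered recurrent state.

-- narrow-down sketch (assumption: the crash can be reproduced in LogSoftMax alone)
require 'nn'
require 'cltorch'
require 'clnn'
cltorch.setDevice(1)

local lsm = nn.LogSoftMax():cl()
-- the same shapes the full model hands to LogSoftMax: [32 x 10000], then [2 x 10000]
print( lsm:forward( torch.randn(32, 10000):cl() ):size() ) -- this shape worked in the full repro
print( lsm:forward( torch.randn(2, 10000):cl() ):size() )  -- expected to hit the same code -54 error if the guess is right
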
$ th a0.lua
libthclnn_searchpath /Users/jlu/torch/install/lib/lua/5.1/libTHCLNN.so
Using Apple , OpenCL platform: Apple
Using OpenCL device: Iris
{
1 : ClTensor - size: 32x10000
2 : ClTensor - size: 32x10000
3 : ClTensor - size: 32x10000
4 : ClTensor - size: 32x10000
5 : ClTensor - size: 32x10000
}
new dimension
kernel source:
1: // Threads per thread block
2: #define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16
3:
4: inline float modifyOp(float _in1) {
5: float _out;
6: float *in1 = &_in1;
7: float *out = &_out;
8: *out = *in1;
9: return _out;
10: }
11:
12: inline float reduceOp(float _in1, float _in2) {
13: // I guess the compiler can sort this stuff out :-P
14: float _out;
15: float *in1 = &_in1;
16: float *in2 = &_in2;
17: float *out = &_out;
18: *out = *in1 + *in2;
19: return _out;
20: }
21:
22: // kernel argument that defines tensor layout
23: typedef struct TensorInfoCl {
24: // Extracts size/stride information for the kernel.
25: // Successive dimensions can be collapsed if the size/strides match
26: // up and thus there are no holes between the dimensions. This is used
27: // to reduce the complexity of the problem.
28: // The optional `reduceDim` indicates a reduction dimension for the
29: // given tensor, so that the output size for this dimension will be 1.
30:
31: int sizes[25];
32: int strides[25];
33: int offset;
34: int dims;
35: } TensorInfoCl;
36: // Contiguous tensors of more than one dimension are collapsed down
37: // to one tensor
38:
39:
40: // Translate a linear index for the apply to a float* offset;
41: // specialized on `Dims` to reduce nvcc compilation time
42:
43: inline int IndexToOffset_1001_get( int linearId, global TensorInfoCl *info) {
44: int offset = info->offset;
45:
46: // Use static dims
47: // for (int i = 1 - 1; i >= 0; --i) {
48: int curDimIndex;
49: int curDimOffset;
50: // bake this in....
51: curDimIndex = linearId % info->sizes[0];
52: curDimOffset = curDimIndex * info->strides[0];
53: offset += curDimOffset;
54:
55:
56:
57: // }
58:
59: return offset;
60: }
61:
62:
63: inline int IndexToOffset_998_get(int linearId, global const TensorInfoCl *info) {
64: return linearId + info->offset;
65: }
66:
67: inline int IndexToOffset_999_get(int linearId, global const TensorInfoCl *info) {
68: int offset = info->offset;
69:
70: // Use dynamic dims
71: for (int i = info->dims - 1; i >= 0; --i) {
72: int curDimIndex = linearId % info->sizes[i];
73: int curDimOffset = curDimIndex * info->strides[i];
74: offset += curDimOffset;
75:
76: linearId /= info->sizes[i];
77: }
78:
79: return offset;
80: }
81:
82: inline int getLinearBlockId() {
83: return get_group_id(2) * get_num_groups(1) * get_num_groups(0) +
84: get_group_id(1) * get_num_groups(0) +
85: get_group_id(0);
86: }
87:
88: // Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will
89: // return the reduced value
90:
91: inline float reduceBlock( local float* smem,
92: int numVals,
93: float threadVal,
94: float init) {
95: if (numVals == 0) {
96: return init;
97: }
98:
99: if ((int)get_local_id(0) < numVals) {
100: smem[ get_local_id(0)] = threadVal;
101: }
102:
103: // First warp will perform reductions across warps
104: barrier(CLK_LOCAL_MEM_FENCE);
105: if ((get_local_id(0) / 32) == 0) {
106: float r = (int)get_local_id(0) < numVals ? smem[get_local_id(0)] : init;
107:
108: for (int i = 32 + get_local_id(0); i < numVals; i += 32) {
109: r = reduceOp(r, smem[i]);
110: }
111:
112: smem[get_local_id(0)] = r;
113: }
114:
115: // First thread will perform reductions across the block
116: barrier(CLK_LOCAL_MEM_FENCE);
117:
118: float r = init;
119: if (get_local_id(0) == 0) {
120: r = smem[0];
121:
122: int numLanesParticipating = min(numVals, 32);
123:
124: if (numLanesParticipating == 32) {
125: // Unroll for 32 == 32 and numVals >= 32
126: // #pragma unroll
127: // unrolling by hand, so compiler-independent
128:
129: r = reduceOp(r, smem[1]);
130:
131: r = reduceOp(r, smem[2]);
132:
133: r = reduceOp(r, smem[3]);
134:
135: r = reduceOp(r, smem[4]);
136:
137: r = reduceOp(r, smem[5]);
138:
139: r = reduceOp(r, smem[6]);
140:
141: r = reduceOp(r, smem[7]);
142:
143: r = reduceOp(r, smem[8]);
144:
145: r = reduceOp(r, smem[9]);
146:
147: r = reduceOp(r, smem[10]);
148:
149: r = reduceOp(r, smem[11]);
150:
151: r = reduceOp(r, smem[12]);
152:
153: r = reduceOp(r, smem[13]);
154:
155: r = reduceOp(r, smem[14]);
156:
157: r = reduceOp(r, smem[15]);
158:
159: r = reduceOp(r, smem[16]);
160:
161: r = reduceOp(r, smem[17]);
162:
163: r = reduceOp(r, smem[18]);
164:
165: r = reduceOp(r, smem[19]);
166:
167: r = reduceOp(r, smem[20]);
168:
169: r = reduceOp(r, smem[21]);
170:
171: r = reduceOp(r, smem[22]);
172:
173: r = reduceOp(r, smem[23]);
174:
175: r = reduceOp(r, smem[24]);
176:
177: r = reduceOp(r, smem[25]);
178:
179: r = reduceOp(r, smem[26]);
180:
181: r = reduceOp(r, smem[27]);
182:
183: r = reduceOp(r, smem[28]);
184:
185: r = reduceOp(r, smem[29]);
186:
187: r = reduceOp(r, smem[30]);
188:
189: r = reduceOp(r, smem[31]);
190:
191: } else {
192: for (int i = 1; i < numLanesParticipating; ++i) {
193: r = reduceOp(r, smem[i]);
194: }
195: }
196: }
197:
198: return r;
199: }
200:
201:
202:
203:
204: inline int getReduceNoncontigDimSliceIndex() {
205: // Each thread handles one slice
206: return getLinearBlockId() * THCL_NONCONTIG_REDUCE_BLOCK_SIZE + /*threadIdx.x*/ get_local_id(0);
207: }
208:
209: // Kernel that handles an entire reduction of a slice of a tensor per each thread
210: kernel void
211: THClTensor_reduceNoncontigDim(global TensorInfoCl *out_info,
212: global float *out_data,
213: global TensorInfoCl *in_info,
214: global float *in_data,
215: int reductionStride,
216: int reductionSize,
217: int totalSlices,
218: float init) {
219: const int sliceIndex = getReduceNoncontigDimSliceIndex();
220:
221: if ((int)sliceIndex >= totalSlices) {
222: return;
223: }
224:
225: // Each thread picks a point in `out` and `in` for which it is
226: // producing the reduction
227: const int outOffset =
228: IndexToOffset_998_get(sliceIndex, &out_info[0]);
229: const int inBaseOffset =
230: IndexToOffset_1001_get(sliceIndex, &in_info[0]);
231:
232: // For each point in reductionSize, reduce into `r`
233: int inOffset = inBaseOffset;
234: float r = init;
235:
236: for (int i = 0; (int)i < reductionSize; ++i) {
237: r = reduceOp(r, modifyOp(in_data[inOffset]));
238: inOffset += reductionStride;
239: }
240:
241: // Write out reduced value
242: out_data[outOffset] = r;
243: }
244:
245: inline int getReduceContigDimSliceIndex() {
246: // Each block handles one slice
247: return getLinearBlockId();
248: }
249:
250: // Kernel that handles an entire reduction of a slice of a tensor per
251: // each block
252: kernel void
253: THClTensor_reduceContigDim(global TensorInfoCl *out_info,
254: global float *out_data,
255: global TensorInfoCl *in_info,
256: global float *in_data,
257: int reductionSize,
258: int totalSlices,
259: float init,
260: local float *smem) {
261: const int sliceIndex = getReduceContigDimSliceIndex();
262:
263: if ((int)sliceIndex >= totalSlices) {
264: return;
265: }
266:
267: // Get the offset in `out` for the reduction
268: const int outOffset =
269: IndexToOffset_998_get(sliceIndex, &out_info[0]);
270:
271: // Get the base offset in `in` for this block's reduction
272: const int inBaseOffset =
273: IndexToOffset_1001_get(sliceIndex, &in_info[0]);
274:
275: // Each thread in the block will reduce some subset of elements in
276: // the slice. The elements are guaranteed contiguous starting at
277: // `inBaseOffset`.
278: float r = init;
279: for (int i = /*threadIdx.x*/ get_local_id(0); (int)i < reductionSize; i += /*blockDim.x*/ get_local_size(0)) {
280: r = reduceOp(r, modifyOp(in_data[inBaseOffset + i]));
281: }
282:
283: // Reduce within the block
284: // extern __shared__ float smem[];
285: r = reduceBlock(smem, /*blockDim.x*/ get_local_size(0), r, init);
286:
287: if (/*threadIdx.x*/ get_local_id(0) == 0) {
288: // Write out reduced value
289: out_data[outOffset] = r;
290: }
291: }
292:
293:
Invalid work group size, code -54
/Users/jlu/torch/install/bin/luajit: /Users/jlu/torch/install/share/lua/5.1/clnn/LogSoftMax.lua:41:
kernel source:
[... the same kernel source as above, repeated and truncated in the error message ...]
stack traceback:
[C]: in function 'sum'
/Users/jlu/torch/install/share/lua/5.1/clnn/LogSoftMax.lua:41: in function 'updateOutput'
/Users/jlu/torch/install/share/lua/5.1/nn/Sequential.lua:44: in function 'updateOutput'
/Users/jlu/torch/install/share/lua/5.1/rnn/Recursor.lua:27: in function 'updateOutput'
/Users/jlu/torch/install/share/lua/5.1/rnn/Sequencer.lua:59: in function 'updateOutput'
/Users/jlu/torch/install/share/lua/5.1/nn/Sequential.lua:44: in function 'forward'
a0.lua:54: in main chunk
[C]: in function 'dofile'
.../jlu/torch/install/lib/luarocks/rocks/trepl/scm-1/bin/th:145: in main chunk
[C]: at 0x01032c0be0
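
For what it's worth, OpenCL error code -54 is CL_INVALID_WORK_GROUP_SIZE. Since the same script runs fine with useGPU = false, one possible stop-gap (again only a sketch, untested, and it assumes the remembered LSTM state converts cleanly back from ClTensor) is to move the model back to the CPU before the second, smaller forward:

-- workaround sketch: run the size-changing evaluation batch on the CPU
lm:evaluate()
lm:float() -- convert parameters (and, hopefully, the remembered state) back to FloatTensor
local smallInputs = torch.randn(5, 2):fill(2):float()
print( lm:forward( smallInputs ) )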