@coodoo
Created March 27, 2016 01:18
-- simple case to reproduce a GPU crash bug
require 'paths'
require 'rnn'
require 'nngraph'
require 'cltorch'
require 'clnn'
cltorch.setDevice(1)
------------------------------------------------------------------------------------
-- model
------------------------------------------------------------------------------------
local lm = nn.Sequential()
-- word embedding layer: vocabulary of 10000, embedding size 200
local lookup = nn.LookupTable( 10000, 200 )
lookup.maxnormout = -1
lm:add( lookup )
lm:add( nn.SplitTable(1) ) -- split along dim 1 (the time steps) into a table of [batch x 200] tensors
-- rnn layer, applied to every time step by the Sequencer
local stepmodule = nn.Sequential()
stepmodule:add( nn.FastLSTM( 200, 200 ) )
stepmodule:add( nn.Linear( 200, 10000 ) )
stepmodule:add( nn.LogSoftMax() )
lm:add( nn.Sequencer(stepmodule) )
lm:remember( 'both' ) -- keep hidden state across forward calls, in both training and evaluation mode
------------------------------------------------------------------------------------
-- test case
------------------------------------------------------------------------------------
-- when set to false, the script runs fine on the CPU
-- when set to true, it crashes the GPU
local useGPU = true
-- 5 time steps x batch of 32; fill(1) overwrites the randn values with the valid vocabulary index 1
local inputs = torch.randn(5, 32):fill(1)
if useGPU == true then
  lm:cl()
  inputs = inputs:cl()
end
local outputs = lm:forward( inputs )
print(outputs)
-- switch to evaluate mode
lm:evaluate()
-- change the input size from [5x32] to [5x2]; this crashes the GPU, see the error message at the end
inputs = torch.randn(5, 2):fill(2)
print('\n\nnew dimension')
outputs = lm:forward( inputs )
print(outputs)
--[[
Invalid work group size, code -54
/Users/jlu/torch/install/bin/luajit: /Users/jlu/torch/install/share/lua/5.1/clnn/LogSoftMax.lua:41:
kernel source:
1: // Threads per thread block
2: #define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16
3:
]]--
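
Before the full console output below: the stack traceback points at clnn's LogSoftMax, so one way to narrow this down might be to feed LogSoftMax the same two batch sizes directly, without the LSTM or the Sequencer. This is an untested sketch, not part of the original repro, and it only helps if the crash really is independent of the remembered recurrent state.

-- narrow-down sketch (assumption: the crash can be reproduced in LogSoftMax alone)
require 'nn'
require 'cltorch'
require 'clnn'
cltorch.setDevice(1)

local lsm = nn.LogSoftMax():cl()
-- the same shapes the full model hands to LogSoftMax: [32 x 10000], then [2 x 10000]
print( lsm:forward( torch.randn(32, 10000):cl() ):size() ) -- this shape worked in the full repro
print( lsm:forward( torch.randn(2, 10000):cl() ):size() )  -- expected to hit the same code -54 error if the guess is right
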
$ th a0.lua
libthclnn_searchpath /Users/jlu/torch/install/lib/lua/5.1/libTHCLNN.so
Using Apple , OpenCL platform: Apple
Using OpenCL device: Iris
{
1 : ClTensor - size: 32x10000
2 : ClTensor - size: 32x10000
3 : ClTensor - size: 32x10000
4 : ClTensor - size: 32x10000
5 : ClTensor - size: 32x10000
}
new dimension
kernel source:
1: // Threads per thread block
2: #define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16
3:
4: inline float modifyOp(float _in1) {
5: float _out;
6: float *in1 = &_in1;
7: float *out = &_out;
8: *out = *in1;
9: return _out;
10: }
11:
12: inline float reduceOp(float _in1, float _in2) {
13: // I guess the compiler can sort this stuff out :-P
14: float _out;
15: float *in1 = &_in1;
16: float *in2 = &_in2;
17: float *out = &_out;
18: *out = *in1 + *in2;
19: return _out;
20: }
21:
22: // kernel argument that defines tensor layout
23: typedef struct TensorInfoCl {
24: // Extracts size/stride information for the kernel.
25: // Successive dimensions can be collapsed if the size/strides match
26: // up and thus there are no holes between the dimensions. This is used
27: // to reduce the complexity of the problem.
28: // The optional `reduceDim` indicates a reduction dimension for the
29: // given tensor, so that the output size for this dimension will be 1.
30:
31: int sizes[25];
32: int strides[25];
33: int offset;
34: int dims;
35: } TensorInfoCl;
36: // Contiguous tensors of more than one dimension are collapsed down
37: // to one tensor
38:
39:
40: // Translate a linear index for the apply to a float* offset;
41: // specialized on `Dims` to reduce nvcc compilation time
42:
43: inline int IndexToOffset_1001_get( int linearId, global TensorInfoCl *info) {
44: int offset = info->offset;
45:
46: // Use static dims
47: // for (int i = 1 - 1; i >= 0; --i) {
48: int curDimIndex;
49: int curDimOffset;
50: // bake this in....
51: curDimIndex = linearId % info->sizes[0];
52: curDimOffset = curDimIndex * info->strides[0];
53: offset += curDimOffset;
54:
55:
56:
57: // }
58:
59: return offset;
60: }
61:
62:
63: inline int IndexToOffset_998_get(int linearId, global const TensorInfoCl *info) {
64: return linearId + info->offset;
65: }
66:
67: inline int IndexToOffset_999_get(int linearId, global const TensorInfoCl *info) {
68: int offset = info->offset;
69:
70: // Use dynamic dims
71: for (int i = info->dims - 1; i >= 0; --i) {
72: int curDimIndex = linearId % info->sizes[i];
73: int curDimOffset = curDimIndex * info->strides[i];
74: offset += curDimOffset;
75:
76: linearId /= info->sizes[i];
77: }
78:
79: return offset;
80: }
81:
82: inline int getLinearBlockId() {
83: return get_group_id(2) * get_num_groups(1) * get_num_groups(0) +
84: get_group_id(1) * get_num_groups(0) +
85: get_group_id(0);
86: }
87:
88: // Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will
89: // return the reduced value
90:
91: inline float reduceBlock( local float* smem,
92: int numVals,
93: float threadVal,
94: float init) {
95: if (numVals == 0) {
96: return init;
97: }
98:
99: if ((int)get_local_id(0) < numVals) {
100: smem[ get_local_id(0)] = threadVal;
101: }
102:
103: // First warp will perform reductions across warps
104: barrier(CLK_LOCAL_MEM_FENCE);
105: if ((get_local_id(0) / 32) == 0) {
106: float r = (int)get_local_id(0) < numVals ? smem[get_local_id(0)] : init;
107:
108: for (int i = 32 + get_local_id(0); i < numVals; i += 32) {
109: r = reduceOp(r, smem[i]);
110: }
111:
112: smem[get_local_id(0)] = r;
113: }
114:
115: // First thread will perform reductions across the block
116: barrier(CLK_LOCAL_MEM_FENCE);
117:
118: float r = init;
119: if (get_local_id(0) == 0) {
120: r = smem[0];
121:
122: int numLanesParticipating = min(numVals, 32);
123:
124: if (numLanesParticipating == 32) {
125: // Unroll for 32 == 32 and numVals >= 32
126: // #pragma unroll
127: // unrolling by hand, so compiler-independent
128:
129: r = reduceOp(r, smem[1]);
130:
131: r = reduceOp(r, smem[2]);
132:
133: r = reduceOp(r, smem[3]);
134:
135: r = reduceOp(r, smem[4]);
136:
137: r = reduceOp(r, smem[5]);
138:
139: r = reduceOp(r, smem[6]);
140:
141: r = reduceOp(r, smem[7]);
142:
143: r = reduceOp(r, smem[8]);
144:
145: r = reduceOp(r, smem[9]);
146:
147: r = reduceOp(r, smem[10]);
148:
149: r = reduceOp(r, smem[11]);
150:
151: r = reduceOp(r, smem[12]);
152:
153: r = reduceOp(r, smem[13]);
154:
155: r = reduceOp(r, smem[14]);
156:
157: r = reduceOp(r, smem[15]);
158:
159: r = reduceOp(r, smem[16]);
160:
161: r = reduceOp(r, smem[17]);
162:
163: r = reduceOp(r, smem[18]);
164:
165: r = reduceOp(r, smem[19]);
166:
167: r = reduceOp(r, smem[20]);
168:
169: r = reduceOp(r, smem[21]);
170:
171: r = reduceOp(r, smem[22]);
172:
173: r = reduceOp(r, smem[23]);
174:
175: r = reduceOp(r, smem[24]);
176:
177: r = reduceOp(r, smem[25]);
178:
179: r = reduceOp(r, smem[26]);
180:
181: r = reduceOp(r, smem[27]);
182:
183: r = reduceOp(r, smem[28]);
184:
185: r = reduceOp(r, smem[29]);
186:
187: r = reduceOp(r, smem[30]);
188:
189: r = reduceOp(r, smem[31]);
190:
191: } else {
192: for (int i = 1; i < numLanesParticipating; ++i) {
193: r = reduceOp(r, smem[i]);
194: }
195: }
196: }
197:
198: return r;
199: }
200:
201:
202:
203:
204: inline int getReduceNoncontigDimSliceIndex() {
205: // Each thread handles one slice
206: return getLinearBlockId() * THCL_NONCONTIG_REDUCE_BLOCK_SIZE + /*threadIdx.x*/ get_local_id(0);
207: }
208:
209: // Kernel that handles an entire reduction of a slice of a tensor per each thread
210: kernel void
211: THClTensor_reduceNoncontigDim(global TensorInfoCl *out_info,
212: global float *out_data,
213: global TensorInfoCl *in_info,
214: global float *in_data,
215: int reductionStride,
216: int reductionSize,
217: int totalSlices,
218: float init) {
219: const int sliceIndex = getReduceNoncontigDimSliceIndex();
220:
221: if ((int)sliceIndex >= totalSlices) {
222: return;
223: }
224:
225: // Each thread picks a point in `out` and `in` for which it is
226: // producing the reduction
227: const int outOffset =
228: IndexToOffset_998_get(sliceIndex, &out_info[0]);
229: const int inBaseOffset =
230: IndexToOffset_1001_get(sliceIndex, &in_info[0]);
231:
232: // For each point in reductionSize, reduce into `r`
233: int inOffset = inBaseOffset;
234: float r = init;
235:
236: for (int i = 0; (int)i < reductionSize; ++i) {
237: r = reduceOp(r, modifyOp(in_data[inOffset]));
238: inOffset += reductionStride;
239: }
240:
241: // Write out reduced value
242: out_data[outOffset] = r;
243: }
244:
245: inline int getReduceContigDimSliceIndex() {
246: // Each block handles one slice
247: return getLinearBlockId();
248: }
249:
250: // Kernel that handles an entire reduction of a slice of a tensor per
251: // each block
252: kernel void
253: THClTensor_reduceContigDim(global TensorInfoCl *out_info,
254: global float *out_data,
255: global TensorInfoCl *in_info,
256: global float *in_data,
257: int reductionSize,
258: int totalSlices,
259: float init,
260: local float *smem) {
261: const int sliceIndex = getReduceContigDimSliceIndex();
262:
263: if ((int)sliceIndex >= totalSlices) {
264: return;
265: }
266:
267: // Get the offset in `out` for the reduction
268: const int outOffset =
269: IndexToOffset_998_get(sliceIndex, &out_info[0]);
270:
271: // Get the base offset in `in` for this block's reduction
272: const int inBaseOffset =
273: IndexToOffset_1001_get(sliceIndex, &in_info[0]);
274:
275: // Each thread in the block will reduce some subset of elements in
276: // the slice. The elements are guaranteed contiguous starting at
277: // `inBaseOffset`.
278: float r = init;
279: for (int i = /*threadIdx.x*/ get_local_id(0); (int)i < reductionSize; i += /*blockDim.x*/ get_local_size(0)) {
280: r = reduceOp(r, modifyOp(in_data[inBaseOffset + i]));
281: }
282:
283: // Reduce within the block
284: // extern __shared__ float smem[];
285: r = reduceBlock(smem, /*blockDim.x*/ get_local_size(0), r, init);
286:
287: if (/*threadIdx.x*/ get_local_id(0) == 0) {
288: // Write out reduced value
289: out_data[outOffset] = r;
290: }
291: }
292:
293:
Invalid work group size, code -54
/Users/jlu/torch/install/bin/luajit: /Users/jlu/torch/install/share/lua/5.1/clnn/LogSoftMax.lua:41:
kernel source:
[... the same kernel source as above, repeated and truncated in the error message ...]
stack traceback:
[C]: in function 'sum'
/Users/jlu/torch/install/share/lua/5.1/clnn/LogSoftMax.lua:41: in function 'updateOutput'
/Users/jlu/torch/install/share/lua/5.1/nn/Sequential.lua:44: in function 'updateOutput'
/Users/jlu/torch/install/share/lua/5.1/rnn/Recursor.lua:27: in function 'updateOutput'
/Users/jlu/torch/install/share/lua/5.1/rnn/Sequencer.lua:59: in function 'updateOutput'
/Users/jlu/torch/install/share/lua/5.1/nn/Sequential.lua:44: in function 'forward'
a0.lua:54: in main chunk
[C]: in function 'dofile'
.../jlu/torch/install/lib/luarocks/rocks/trepl/scm-1/bin/th:145: in main chunk
[C]: at 0x01032c0be0
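
For what it's worth, OpenCL error code -54 is CL_INVALID_WORK_GROUP_SIZE. Since the same script runs fine with useGPU = false, one possible stop-gap (again only a sketch, untested, and it assumes the remembered LSTM state converts cleanly back from ClTensor) is to move the model back to the CPU before the second, smaller forward:

-- workaround sketch: run the size-changing evaluation batch on the CPU
lm:evaluate()
lm:float() -- convert parameters (and, hopefully, the remembered state) back to FloatTensor
local smallInputs = torch.randn(5, 2):fill(2):float()
print( lm:forward( smallInputs ) )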