-- simple case to reproduce GPU crash bug
require 'paths'
require 'rnn'
require 'nngraph'
require 'cltorch'
require 'clnn'
cltorch.setDevice(1)
------------------------------------------------------------------------------------
-- model
------------------------------------------------------------------------------------
local lm = nn.Sequential()
local lookup = nn.LookupTable( 10000, 200 )
lookup.maxnormout = -1
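-- maxnormout = -1 turns off max-norm renormalization of the lookup
-- output (the rnn package's language-model examples set it the same way)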
lm:add( lookup )
lm:add( nn.SplitTable(1) )
-- rnn layer
local stepmodule = nn.Sequential()
stepmodule:add( nn.FastLSTM( 200, 200 ) )
stepmodule:add( nn.Linear(200, 10000) )
stepmodule:add(nn.LogSoftMax())
lm:add(nn.Sequencer(stepmodule))
lm:remember( 'both' )
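-- remember('both') keeps the LSTM hidden state across consecutive
-- forward calls in both training and evaluation mode, rather than
-- resetting it between sequences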
------------------------------------------------------------------------------------
-- test case
------------------------------------------------------------------------------------
-- when set to false, runs fine on CPU
-- when set to true, crashes GPU
local useGPU = true
local inputs = torch.randn(5, 32):fill(1)
if useGPU == true then
  lm:cl()
  inputs = inputs:cl()
end
local outputs = lm:forward( inputs )
print(outputs)
-- switch to evaluate mode
lm:evaluate()
-- change the batch dimension from [5x32] to [5x2]; this crashes the GPU, see the error message at the end
inputs = torch.randn(5, 2):fill(2)
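-- note: the failure below is the clnn LogSoftMax reduce kernel being
-- launched with a work group size the device rejects (OpenCL code -54)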
print('\n\nnew dimension')
outputs = lm:forward( inputs )
print(outputs)
--[[
Invalid work group size, code -54
/Users/jlu/torch/install/bin/luajit: /Users/jlu/torch/install/share/lua/5.1/clnn/LogSoftMax.lua:41:
kernel source:
1: // Threads per thread block
2: #define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16
3:
]]--
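A possible workaround while the kernel sizing issue is open: keep the batch dimension constant by padding smaller batches up to the width the model first saw, then slicing the padded rows off the outputs. This is only a sketch and has not been run against the setup above; padBatch and fullBatch are names introduced here.

-- pad a seqLen x batch tensor of word indices up to a fixed batch size
-- by repeating its last column, so the model always sees the same width
local function padBatch(inputs, fullBatch)
  local batch = inputs:size(2)
  if batch >= fullBatch then return inputs, batch end
  local padded = inputs.new(inputs:size(1), fullBatch)
  padded:narrow(2, 1, batch):copy(inputs)
  for j = batch + 1, fullBatch do
    -- duplicate the last real column into each padding column
    padded:select(2, j):copy(inputs:select(2, batch))
  end
  return padded, batch
end

-- usage sketch: forward the padded batch, then keep only the real rows
-- local padded, real = padBatch(inputs, 32)
-- local outputs = lm:forward(padded)
-- for t = 1, #outputs do outputs[t] = outputs[t]:narrow(1, 1, real) end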
$ th a0.lua
libthclnn_searchpath /Users/jlu/torch/install/lib/lua/5.1/libTHCLNN.so
Using Apple , OpenCL platform: Apple
Using OpenCL device: Iris
{
  1 : ClTensor - size: 32x10000
  2 : ClTensor - size: 32x10000
  3 : ClTensor - size: 32x10000
  4 : ClTensor - size: 32x10000
  5 : ClTensor - size: 32x10000
}
new dimension
kernel source:
1: // Threads per thread block
2: #define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16
3:
4: inline float modifyOp(float _in1) {
5: float _out;
6: float *in1 = &_in1;
7: float *out = &_out;
8: *out = *in1;
9: return _out;
10: }
11:
12: inline float reduceOp(float _in1, float _in2) {
13: // I guess the compiler can sort this stuff out :-P
14: float _out;
15: float *in1 = &_in1;
16: float *in2 = &_in2;
17: float *out = &_out;
18: *out = *in1 + *in2;
19: return _out;
20: }
21:
22: // kernel argument that defines tensor layout
23: typedef struct TensorInfoCl {
24: // Extracts size/stride information for the kernel.
25: // Successive dimensions can be collapsed if the size/strides match
26: // up and thus there are no holes between the dimensions. This is used
27: // to reduce the complexity of the problem.
28: // The optional `reduceDim` indicates a reduction dimension for the
29: // given tensor, so that the output size for this dimension will be 1.
30:
31: int sizes[25];
32: int strides[25];
33: int offset;
34: int dims;
35: } TensorInfoCl;
36: // Contiguous tensors of more than one dimension are collapsed down
37: // to one tensor
38:
39:
40: // Translate a linear index for the apply to a float* offset;
41: // specialized on `Dims` to reduce nvcc compilation time
42:
43: inline int IndexToOffset_1001_get( int linearId, global TensorInfoCl *info) {
44: int offset = info->offset;
45:
46: // Use static dims
47: // for (int i = 1 - 1; i >= 0; --i) {
48: int curDimIndex;
49: int curDimOffset;
50: // bake this in....
51: curDimIndex = linearId % info->sizes[0];
52: curDimOffset = curDimIndex * info->strides[0];
53: offset += curDimOffset;
54:
55:
56:
57: // }
58:
59: return offset;
60: }
61:
62:
63: inline int IndexToOffset_998_get(int linearId, global const TensorInfoCl *info) {
64: return linearId + info->offset;
65: }
66:
67: inline int IndexToOffset_999_get(int linearId, global const TensorInfoCl *info) {
68: int offset = info->offset;
69:
70: // Use dynamic dims
71: for (int i = info->dims - 1; i >= 0; --i) {
72: int curDimIndex = linearId % info->sizes[i];
73: int curDimOffset = curDimIndex * info->strides[i];
74: offset += curDimOffset;
75:
76: linearId /= info->sizes[i];
77: }
78:
79: return offset;
80: }
81:
82: inline int getLinearBlockId() {
83: return get_group_id(2) * get_num_groups(1) * get_num_groups(0) +
84: get_group_id(1) * get_num_groups(0) +
85: get_group_id(0);
86: }
87:
88: // Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will
89: // return the reduced value
90:
91: inline float reduceBlock( local float* smem,
92: int numVals,
93: float threadVal,
94: float init) {
95: if (numVals == 0) {
96: return init;
97: }
98:
99: if ((int)get_local_id(0) < numVals) {
100: smem[ get_local_id(0)] = threadVal;
101: }
102:
103: // First warp will perform reductions across warps
104: barrier(CLK_LOCAL_MEM_FENCE);
105: if ((get_local_id(0) / 32) == 0) {
106: float r = (int)get_local_id(0) < numVals ? smem[get_local_id(0)] : init;
107:
108: for (int i = 32 + get_local_id(0); i < numVals; i += 32) {
109: r = reduceOp(r, smem[i]);
110: }
111:
112: smem[get_local_id(0)] = r;
113: }
114:
115: // First thread will perform reductions across the block
116: barrier(CLK_LOCAL_MEM_FENCE);
117:
118: float r = init;
119: if (get_local_id(0) == 0) {
120: r = smem[0];
121:
122: int numLanesParticipating = min(numVals, 32);
123:
124: if (numLanesParticipating == 32) {
125: // Unroll for 32 == 32 and numVals >= 32
126: // #pragma unroll
127: // unrolling by hand, so compiler-independent
128:
129: r = reduceOp(r, smem[1]);
130:
131: r = reduceOp(r, smem[2]);
132:
133: r = reduceOp(r, smem[3]);
134:
135: r = reduceOp(r, smem[4]);
136:
137: r = reduceOp(r, smem[5]);
138:
139: r = reduceOp(r, smem[6]);
140:
141: r = reduceOp(r, smem[7]);
142:
143: r = reduceOp(r, smem[8]);
144:
145: r = reduceOp(r, smem[9]);
146:
147: r = reduceOp(r, smem[10]);
148:
149: r = reduceOp(r, smem[11]);
150:
151: r = reduceOp(r, smem[12]);
152:
153: r = reduceOp(r, smem[13]);
154:
155: r = reduceOp(r, smem[14]);
156:
157: r = reduceOp(r, smem[15]);
158:
159: r = reduceOp(r, smem[16]);
160:
161: r = reduceOp(r, smem[17]);
162:
163: r = reduceOp(r, smem[18]);
164:
165: r = reduceOp(r, smem[19]);
166:
167: r = reduceOp(r, smem[20]);
168:
169: r = reduceOp(r, smem[21]);
170:
171: r = reduceOp(r, smem[22]);
172:
173: r = reduceOp(r, smem[23]);
174:
175: r = reduceOp(r, smem[24]);
176:
177: r = reduceOp(r, smem[25]);
178:
179: r = reduceOp(r, smem[26]);
180:
181: r = reduceOp(r, smem[27]);
182:
183: r = reduceOp(r, smem[28]);
184:
185: r = reduceOp(r, smem[29]);
186:
187: r = reduceOp(r, smem[30]);
188:
189: r = reduceOp(r, smem[31]);
190:
191: } else {
192: for (int i = 1; i < numLanesParticipating; ++i) {
193: r = reduceOp(r, smem[i]);
194: }
195: }
196: }
197:
198: return r;
199: }
200:
201:
202:
203:
204: inline int getReduceNoncontigDimSliceIndex() {
205: // Each thread handles one slice
206: return getLinearBlockId() * THCL_NONCONTIG_REDUCE_BLOCK_SIZE + /*threadIdx.x*/ get_local_id(0);
207: }
208:
209: // Kernel that handles an entire reduction of a slice of a tensor per each thread
210: kernel void
211: THClTensor_reduceNoncontigDim(global TensorInfoCl *out_info,
212: global float *out_data,
213: global TensorInfoCl *in_info,
214: global float *in_data,
215: int reductionStride,
216: int reductionSize,
217: int totalSlices,
218: float init) {
219: const int sliceIndex = getReduceNoncontigDimSliceIndex();
220:
221: if ((int)sliceIndex >= totalSlices) {
222: return;
223: }
224:
225: // Each thread picks a point in `out` and `in` for which it is
226: // producing the reduction
227: const int outOffset =
228: IndexToOffset_998_get(sliceIndex, &out_info[0]);
229: const int inBaseOffset =
230: IndexToOffset_1001_get(sliceIndex, &in_info[0]);
231:
232: // For each point in reductionSize, reduce into `r`
233: int inOffset = inBaseOffset;
234: float r = init;
235:
236: for (int i = 0; (int)i < reductionSize; ++i) {
237: r = reduceOp(r, modifyOp(in_data[inOffset]));
238: inOffset += reductionStride;
239: }
240:
241: // Write out reduced value
242: out_data[outOffset] = r;
243: }
244:
245: inline int getReduceContigDimSliceIndex() {
246: // Each block handles one slice
247: return getLinearBlockId();
248: }
249:
250: // Kernel that handles an entire reduction of a slice of a tensor per
251: // each block
252: kernel void
253: THClTensor_reduceContigDim(global TensorInfoCl *out_info,
254: global float *out_data,
255: global TensorInfoCl *in_info,
256: global float *in_data,
257: int reductionSize,
258: int totalSlices,
259: float init,
260: local float *smem) {
261: const int sliceIndex = getReduceContigDimSliceIndex();
262:
263: if ((int)sliceIndex >= totalSlices) {
264: return;
265: }
266:
267: // Get the offset in `out` for the reduction
268: const int outOffset =
269: IndexToOffset_998_get(sliceIndex, &out_info[0]);
270:
271: // Get the base offset in `in` for this block's reduction
272: const int inBaseOffset =
273: IndexToOffset_1001_get(sliceIndex, &in_info[0]);
274:
275: // Each thread in the block will reduce some subset of elements in
276: // the slice. The elements are guaranteed contiguous starting at
277: // `inBaseOffset`.
278: float r = init;
279: for (int i = /*threadIdx.x*/ get_local_id(0); (int)i < reductionSize; i += /*blockDim.x*/ get_local_size(0)) {
280: r = reduceOp(r, modifyOp(in_data[inBaseOffset + i]));
281: }
282:
283: // Reduce within the block
284: // extern __shared__ float smem[];
285: r = reduceBlock(smem, /*blockDim.x*/ get_local_size(0), r, init);
286:
287: if (/*threadIdx.x*/ get_local_id(0) == 0) {
288: // Write out reduced value
289: out_data[outOffset] = r;
290: }
291: }
292:
293:
Invalid work group size, code -54
/Users/jlu/torch/install/bin/luajit: /Users/jlu/torch/install/share/lua/5.1/clnn/LogSoftMax.lua:41:
kernel source:
1: // Threads per thread block
2: #define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16
3:
4: inline float modifyOp(float _in1) {
5: float _out;
6: float *in1 = &_in1;
7: float *out = &_out;
8: *out = *in1;
9: return _out;
10: }
11:
12: inline float reduceOp(float _in1, float _in2) {
13: // I guess the compiler can sort this stuff out :-P
14: float _out;
15: float *in1 = &_in1;
16: float *in2 = &_in2;
17: float *out = &_out;
18: *out = *in1 + *in2;
19: return _out;
20: }
21:
22: // kernel argument that defines tensor layout
23: typedef struct TensorInfoCl {
24: // Extracts size/stride information for the kernel.
25: // Successive dimensions can be collapsed if the size/strides match
26: // up and thus there are no holes between the dimensions. This is used
27: // to reduce the complexity of the problem.
28: // The optional `reduceDim` indicates a reduction dimension for the
29: // given tensor, so that the output size for this dimension will be 1.
30:
31: int sizes[25];
32: int strides[25];
33: int offset;
34: int dims;
35: } TensorInfoCl;
36: // Contiguous tensors of more than one dimension are collapsed down
37: // to one tensor
38:
39:
40: // Translate a linear index for the apply to a float* offset;
41: // specialized on `Dims` to reduce nvcc compilation time
42:
43: inline int IndexToOffset_1001_get( int linearId, global TensorInfoCl *info) {
44: int offset = info->offset;
45:
46: // Use static dims
47: // for (int i = 1 - 1; i >= 0; --i) {
48: int curDimIndex;
49: int curDimOffset;
50: // bake this in....
51: curDimIndex = linearId % info->sizes[0];
52: curDimOffset = curDimIndex * info->strides[0];
53: offset += curDimOffset;
54:
55:
56:
57: // }
58:
59: return offset;
60: }
61:
62:
63: inline int IndexToOffset_998_get(int linearId, global const TensorInfoCl *info) {
64: return linearId + info->offset;
65: }
66:
67: inline int IndexToOffset_999_get(int linearId, global const TensorInfoCl *info
stack traceback:
	[C]: in function 'sum'
	/Users/jlu/torch/install/share/lua/5.1/clnn/LogSoftMax.lua:41: in function 'updateOutput'
	/Users/jlu/torch/install/share/lua/5.1/nn/Sequential.lua:44: in function 'updateOutput'
	/Users/jlu/torch/install/share/lua/5.1/rnn/Recursor.lua:27: in function 'updateOutput'
	/Users/jlu/torch/install/share/lua/5.1/rnn/Sequencer.lua:59: in function 'updateOutput'
	/Users/jlu/torch/install/share/lua/5.1/nn/Sequential.lua:44: in function 'forward'
	a0.lua:54: in main chunk
	[C]: in function 'dofile'
	.../jlu/torch/install/lib/luarocks/rocks/trepl/scm-1/bin/th:145: in main chunk
	[C]: at 0x01032c0be0
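For reference, OpenCL status -54 is CL_INVALID_WORK_GROUP_SIZE: the local work size requested for the kernel launch is not acceptable to the device, e.g. it exceeds CL_DEVICE_MAX_WORK_GROUP_SIZE (or the kernel's CL_KERNEL_WORK_GROUP_SIZE), or the global size is not a multiple of it. A quick way to inspect the device limits from Torch, assuming cltorch exposes getDeviceProperties the way cutorch does:

require 'cltorch'

-- dump everything cltorch reports about device 1; look for the
-- maximum work group size the Iris GPU actually supports
local props = cltorch.getDeviceProperties(1)
for k, v in pairs(props) do
  print(k, v)
end

If that function is unavailable in your cltorch build, clinfo on the command line reports the same CL_DEVICE_MAX_WORK_GROUP_SIZE limit.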