~/torch ⮀ master ⮀ 7d17h28m ⮀ uname -a ⮂ ruby-2.3.0 | |
Linux localhost.localdomain 4.3.5-300.fc23.x86_64 #1 SMP Mon Feb 1 03:18:41 UTC 2016 x86_64 x86_64 x86_64 GNU/Linux | |
~/torch ⮀ master ⮀ 7d17h25m ⮀ clinfo ⮂ ruby-2.3.0 | |
Number of platforms 1 | |
Platform Name Clover | |
Platform Vendor Mesa | |
Platform Version OpenCL 1.1 MESA 11.1.0 | |
Platform Profile FULL_PROFILE | |
Platform Extensions cl_khr_icd | |
Platform Extensions function suffix MESA | |
Platform Name Clover | |
Number of devices 1 | |
Device Name AMD HAWAII (DRM 2.43.0, LLVM 3.7.0) | |
Device Vendor AMD | |
Device Vendor ID 0x1002 | |
Device Version OpenCL 1.1 MESA 11.1.0 | |
Driver Version 11.1.0 | |
Device OpenCL C Version OpenCL C 1.1 | |
Device Type GPU | |
Device Profile FULL_PROFILE | |
Max compute units 40 | |
Max clock frequency 1020MHz | |
Max work item dimensions 3 | |
Max work item sizes 256x256x256 | |
Max work group size 256 | |
Preferred work group size multiple 64 | |
Preferred / native vector sizes | |
char 16 / 16 | |
short 8 / 8 | |
int 4 / 4 | |
long 2 / 2 | |
half 0 / 0 (n/a) | |
float 4 / 4 | |
double 2 / 2 (cl_khr_fp64) | |
Half-precision Floating-point support (n/a) | |
Single-precision Floating-point support (core) | |
Denormals No | |
Infinity and NANs Yes | |
Round to nearest Yes | |
Round to zero No | |
Round to infinity No | |
IEEE754-2008 fused multiply-add No | |
Support is emulated in software No | |
Correctly-rounded divide and sqrt operations No | |
Double-precision Floating-point support (cl_khr_fp64) | |
Denormals Yes | |
Infinity and NANs Yes | |
Round to nearest Yes | |
Round to zero Yes | |
Round to infinity Yes | |
IEEE754-2008 fused multiply-add Yes | |
Support is emulated in software No | |
Correctly-rounded divide and sqrt operations No | |
Address bits 32, Little-Endian | |
Global memory size 1073741824 (1024MiB) | |
Error Correction support No | |
Max memory allocation 268435456 (256MiB) | |
Unified memory for Host and Device Yes | |
Minimum alignment for any data type 128 bytes | |
Alignment of base address 1024 bits (128 bytes) | |
Global Memory cache type None | |
Image support No | |
Local memory type Local | |
Local memory size 32768 (32KiB) | |
Max constant buffer size 268435456 (256MiB) | |
Max number of constant args 16 | |
Max size of kernel argument 1024 | |
Queue properties | |
Out-of-order execution No | |
Profiling Yes | |
Profiling timer resolution 0ns | |
Execution capabilities | |
Run OpenCL kernels Yes | |
Run native kernels No | |
Device Available Yes | |
Compiler Available Yes | |
Device Extensions cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_fp64 | |
NULL platform behavior | |
clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) Clover | |
clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) Success [MESA] | |
clCreateContext(NULL, ...) [default] Success [MESA] | |
clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No devices found in platform | |
clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) Success (1) | |
Platform Name Clover | |
Device Name AMD HAWAII (DRM 2.43.0, LLVM 3.7.0) | |
clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No devices found in platform | |
clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No devices found in platform | |
clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) Success (1) | |
Platform Name Clover | |
Device Name AMD HAWAII (DRM 2.43.0, LLVM 3.7.0) | |
ICD loader properties | |
ICD loader Name OpenCL ICD Loader | |
ICD loader Vendor OCL Icd free software | |
ICD loader Version 2.2.7 | |
ICD loader Profile OpenCL 1.2 | |
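For reference, the device properties clinfo reports above can also be read from host code. The short C sketch below queries the same name, version and extension strings for the first GPU on the first platform (error handling omitted; nothing here is clnn-specific):

/* minimal host-side query sketch, OpenCL 1.1 C API */
#include <stdio.h>
#include <string.h>
#include <CL/cl.h>

int main(void) {
    cl_platform_id platform;
    cl_device_id device;
    char name[256], version[256], extensions[4096];

    clGetPlatformIDs(1, &platform, NULL);                         /* Clover on this machine */
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(name), name, NULL);
    clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(version), version, NULL);
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(extensions), extensions, NULL);

    printf("Device:  %s\n", name);
    printf("Version: %s\n", version);
    printf("fp64:    %s\n", strstr(extensions, "cl_khr_fp64") ? "yes" : "no");
    return 0;
}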
~/torch ⮀ master ⮀ 7d17h25m ⮀ luajit -l clnn -e 'clnn.test()' ⮂ ruby-2.3.0 | |
libthclnn_searchpath /home/lcestari/torch/install/lib/lua/5.1/libTHCLNN.so | |
Running 68 tests | |
|___________________________________________________________________ ==> Abs_backwardUsing Mesa , OpenCL platform: Clover | |
Using OpenCL device: AMD HAWAII (DRM 2.43.0, LLVM 3.7.0) | |
____|_______________________________________________________________ ==> ClassNLLCriterionMultipleTargetTHClReduceAll.cl build log: | |
unsupported call to function reduceBlock in THClTensor_reduceAll | |
kernel build error: | |
kernel source: | |
1: inline unsigned int THClCeilDiv(unsigned int a, unsigned int b) { | |
2: return (a + b - 1) / b; | |
3: } | |
4: | |
5: | |
6: | |
7: inline float modifyOp(float _in1) { | |
8: float _out; | |
9: float *in1 = &_in1; | |
10: float *out = &_out; | |
11: *out = *in1; | |
12: return _out; | |
13: } | |
14: | |
15: inline float reduceOp(float _in1, float _in2) { | |
16: // I guess the compiler can sort this stuff out :-P | |
17: float _out; | |
18: float *in1 = &_in1; | |
19: float *in2 = &_in2; | |
20: float *out = &_out; | |
21: *out = *in1 + *in2; | |
22: return _out; | |
23: } | |
24: | |
25: // kernel argument that defines tensor layout | |
26: typedef struct TensorInfoCl { | |
27: // Extracts size/stride information for the kernel. | |
28: // Successive dimensions can be collapsed if the size/strides match | |
29: // up and thus there are no holes between the dimensions. This is used | |
30: // to reduce the complexity of the problem. | |
31: // The optional `reduceDim` indicates a reduction dimension for the | |
32: // given tensor, so that the output size for this dimension will be 1. | |
33: | |
34: unsigned int sizes[25]; | |
35: unsigned int strides[25]; | |
36: unsigned int offset; | |
37: int dims; | |
38: } TensorInfoCl; | |
39: // Contiguous tensors of more than one dimension are collapsed down | |
40: // to one tensor | |
41: | |
42: | |
43: // Translate a linear index for the apply to a float* offset; | |
44: // specialized on `Dims` to reduce nvcc compilation time | |
45: | |
46: | |
47: inline unsigned int IndexToOffset_998_get(unsigned int linearId, global const TensorInfoCl *info) { | |
48: return linearId + info->offset; | |
49: } | |
50: | |
51: inline unsigned int IndexToOffset_999_get(unsigned int linearId, global const TensorInfoCl *info) { | |
52: unsigned int offset = info->offset; | |
53: | |
54: // Use dynamic dims | |
55: for (int i = info->dims - 1; i >= 0; --i) { | |
56: unsigned int curDimIndex = linearId % info->sizes[i]; | |
57: unsigned int curDimOffset = curDimIndex * info->strides[i]; | |
58: offset += curDimOffset; | |
59: | |
60: linearId /= info->sizes[i]; | |
61: } | |
62: | |
63: return offset; | |
64: } | |
65: | |
66: inline unsigned int getLinearBlockId() { | |
67: return get_group_id(2) * get_num_groups(1) * get_num_groups(0) + | |
68: get_group_id(1) * get_num_groups(0) + | |
69: get_group_id(0); | |
70: } | |
71: | |
72: // Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will | |
73: // return the reduced value | |
74: | |
75: inline float reduceBlock( local float* smem, | |
76: int numVals, | |
77: float threadVal, | |
78: float init) { | |
79: if (numVals == 0) { | |
80: return init; | |
81: } | |
82: | |
83: if ((int)get_local_id(0) < numVals) { | |
84: smem[ get_local_id(0)] = threadVal; | |
85: } | |
86: | |
87: // First warp will perform reductions across warps | |
88: barrier(CLK_LOCAL_MEM_FENCE); | |
89: if ((get_local_id(0) / 32) == 0) { | |
90: float r = (int)get_local_id(0) < numVals ? smem[get_local_id(0)] : init; | |
91: | |
92: for (int i = 32 + get_local_id(0); i < numVals; i += 32) { | |
93: r = reduceOp(r, smem[i]); | |
94: } | |
95: | |
96: smem[get_local_id(0)] = r; | |
97: } | |
98: | |
99: // First thread will perform reductions across the block | |
100: barrier(CLK_LOCAL_MEM_FENCE); | |
101: | |
102: float r = init; | |
103: if (get_local_id(0) == 0) { | |
104: r = smem[0]; | |
105: | |
106: int numLanesParticipating = min(numVals, 32); | |
107: | |
108: if (numLanesParticipating == 32) { | |
109: // Unroll for 32 == 32 and numVals >= 32 | |
110: // #pragma unroll | |
111: // unrolling by hand, so compiler-independent | |
112: | |
113: r = reduceOp(r, smem[1]); | |
114: | |
115: r = reduceOp(r, smem[2]); | |
116: | |
117: r = reduceOp(r, smem[3]); | |
118: | |
119: r = reduceOp(r, smem[4]); | |
120: | |
121: r = reduceOp(r, smem[5]); | |
122: | |
123: r = reduceOp(r, smem[6]); | |
124: | |
125: r = reduceOp(r, smem[7]); | |
126: | |
127: r = reduceOp(r, smem[8]); | |
128: | |
129: r = reduceOp(r, smem[9]); | |
130: | |
131: r = reduceOp(r, smem[10]); | |
132: | |
133: r = reduceOp(r, smem[11]); | |
134: | |
135: r = reduceOp(r, smem[12]); | |
136: | |
137: r = reduceOp(r, smem[13]); | |
138: | |
139: r = reduceOp(r, smem[14]); | |
140: | |
141: r = reduceOp(r, smem[15]); | |
142: | |
143: r = reduceOp(r, smem[16]); | |
144: | |
145: r = reduceOp(r, smem[17]); | |
146: | |
147: r = reduceOp(r, smem[18]); | |
148: | |
149: r = reduceOp(r, smem[19]); | |
150: | |
151: r = reduceOp(r, smem[20]); | |
152: | |
153: r = reduceOp(r, smem[21]); | |
154: | |
155: r = reduceOp(r, smem[22]); | |
156: | |
157: r = reduceOp(r, smem[23]); | |
158: | |
159: r = reduceOp(r, smem[24]); | |
160: | |
161: r = reduceOp(r, smem[25]); | |
162: | |
163: r = reduceOp(r, smem[26]); | |
164: | |
165: r = reduceOp(r, smem[27]); | |
166: | |
167: r = reduceOp(r, smem[28]); | |
168: | |
169: r = reduceOp(r, smem[29]); | |
170: | |
171: r = reduceOp(r, smem[30]); | |
172: | |
173: r = reduceOp(r, smem[31]); | |
174: | |
175: } else { | |
176: for (int i = 1; i < numLanesParticipating; ++i) { | |
177: r = reduceOp(r, smem[i]); | |
178: } | |
179: } | |
180: } | |
181: | |
182: return r; | |
183: } | |
184: | |
185: | |
186: | |
187: | |
188: // Kernel that handles an entire reduction of a tensor in one pass | |
189: kernel void | |
190: THClTensor_reduceAll(global TensorInfoCl *in_info, | |
191: global float *in_data, | |
192: unsigned int totalElements, | |
193: float init, | |
194: global float* out, | |
195: local float *smem) { | |
196: // With a block-wide stride, have each thread perform its own reduction. | |
197: float r = init; | |
198: for (unsigned int i = get_local_id(0); i < totalElements; i += get_local_size(0)) { | |
199: const unsigned int inOffset = IndexToOffset_998_get(i, &in_info[0]); | |
200: r = reduceOp(r, modifyOp(in_data[inOffset])); | |
201: } | |
202: | |
203: // Reduce within the block | |
204: r = reduceBlock(smem, get_local_size(0), r, init); | |
205: | |
206: if(get_local_id(0) == 0) { | |
207: // Write out reduced value | |
208: out[0] = r; | |
209: } | |
210: } | |
211: | |
212: inline unsigned int getStartIndex(unsigned int totalSize) { | |
213: unsigned int sizePerBlock = THClCeilDiv(totalSize, (unsigned int) get_num_groups(0)); | |
214: return get_group_id(0) * sizePerBlock; | |
215: } | |
216: | |
217: inline unsigned int getEndIndex(unsigned int totalSize) { | |
218: unsigned int sizePerBlock = THClCeilDiv(totalSize, (unsigned int) get_num_groups(0)); | |
219: return min((unsigned int) ((get_group_id(0) + 1) * sizePerBlock), totalSize); | |
220: } | |
221: | |
222: // Kernel that handles an entire reduction of a tensor in two passes | |
223: kernel void | |
224: THClTensor_reduceAllPass1(global TensorInfoCl *in_info, | |
225: global float *in_data, | |
226: unsigned int totalElements, | |
227: float init, | |
228: global float* scratchSpace, | |
229: local float *smem) { | |
230: const unsigned int startIndex = getStartIndex(totalElements); | |
231: const unsigned int endIndex = getEndIndex(totalElements); | |
232: | |
233: // With a block-wide stride, have each thread perform its own reduction. | |
234: float r = init; | |
235: for (unsigned int i = startIndex + get_local_id(0); i < endIndex; i += get_local_size(0)) { | |
236: const unsigned int inOffset = IndexToOffset_998_get(i, &in_info[0]); | |
237: r = reduceOp(r, modifyOp(in_data[inOffset])); | |
238: } | |
239: | |
240: // Reduce within the block | |
241: r = reduceBlock(smem, get_local_size(0), r, init); | |
242: | |
243: if ((int)get_local_id(0) == 0) { | |
244: // Write out block-wide reduced value | |
245: scratchSpace[get_group_id(0)] = r; | |
246: } | |
247: } | |
248: | |
249: kernel void THClTensor_reduceAllPass2(int numPass1Blocks, | |
250: float init, | |
251: global float* scratchSpace, | |
252: global float* out, | |
253: local float *smem) { | |
254: float r = init; | |
255: if ((int)get_local_id(0) < numPass1Blocks) { | |
256: r = scratchSpace[get_local_id(0)]; | |
257: } | |
258: | |
259: // Reduce within the block | |
260: r = reduceBlock(smem, numPass1Blocks, r, init); | |
261: | |
262: if((int)get_local_id(0) == 0) { | |
263: out[0] = r; | |
264: } | |
265: } | |
266: | |
267: | |
268: | |
269: | |
Something went wrong with clCreateKernel, OpenCL error code -45 | |
THClReduceAll.cl build log: | |
unsupported call to function reduceBlock in THClTensor_reduceAll | |
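Error -45 from clCreateKernel is CL_INVALID_PROGRAM_EXECUTABLE in cl.h: the kernel is being created from a program whose build already failed, so the "unsupported call to function reduceBlock" line above is the actual problem. One plausible reading (an assumption, not something this log confirms) is that this Clover / LLVM 3.7 stack rejects calls to helper functions that take a local pointer unless they are fully inlined. The fragment below is a minimal OpenCL C sketch of that idea on a hypothetical sum_local helper, forcing inlining with Clang's always_inline attribute; it is not the kernel clnn generates.

// hedged workaround sketch: force-inline a helper that takes a local pointer
inline __attribute__((always_inline))
float sum_local(local float *smem, int n) {
    float r = 0.0f;
    for (int i = 0; i < n; ++i) {
        r += smem[i];
    }
    return r;
}

kernel void reduce_demo(global float *out, global const float *in, local float *smem) {
    int lid = (int)get_local_id(0);
    smem[lid] = in[get_global_id(0)];      // each work-item stages one element
    barrier(CLK_LOCAL_MEM_FENCE);
    if (lid == 0) {
        out[get_group_id(0)] = sum_local(smem, (int)get_local_size(0));
    }
}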
____*________|______________________________________________________ ==> LogSoftMax_backwardTHClTensorMathTransformReduce.cl build log: | |
input.cl:8:1: error: OpenCL does not support the 'static' storage class specifier | |
kernel build error: | |
kernel source: | |
1: // from lib/THC/THCTensorMathTransformReduce.cu: | |
2: | |
3: typedef struct Pair { | |
4: float first; | |
5: float second; | |
6: } Pair; | |
7: | |
8: static Pair binary_op( Pair a, Pair b ) { | |
9: if( a.first > b.first ) { return a; } else { return b; }; | |
10: } | |
11: | |
12: /* A set of reduction kernels that take in binary ops on thrust pairs (of value, index). | |
13: These are useful when you not only have to do a reduction, but you might have | |
14: to preserve the location of contention (for example min/max operations). | |
15: The structure of the kernels follows the structure of the reduction kernels. | |
16: */ | |
17: kernel void THClTensor_kernel_transformReduceOuterDimIndex( | |
18: global float *tgt1_data, int tgt1_offset, | |
19: global float *tgt2_data, int tgt2_offset, | |
20: global float *src__data, int src__offset, | |
21: int num_orows, int num_irows, int row_size | |
22: ) { | |
23: global float *tgt1 = tgt1_data + tgt1_offset; | |
24: global float *tgt2 = tgt2_data + tgt2_offset; | |
25: global float *src_ = src__data + src__offset; | |
26: | |
27: for (int orow = get_group_id(0); orow < num_orows; orow += get_num_groups(0)) { | |
28: for (int irow = get_group_id(1) * get_local_size(0) + get_local_id(0); irow < num_irows; irow += get_num_groups(1) * get_local_size(0)) { | |
29: global float *src = src_ + orow * row_size * num_irows + irow; | |
30: Pair acc = {.first=-3.40282e+38f, .second=-1}; | |
31: for (int col = 0; col < row_size; ++col) { | |
32: Pair lhs = {*src, col+1}; | |
33: acc = binary_op( lhs, acc); | |
34: // acc = binary_op(thrust::make_pair(*src, col+1), acc); // i+1 for 1-indexing | |
35: src += num_irows; | |
36: } | |
37: tgt1[orow * num_irows + irow] = acc.first; | |
38: tgt2[orow * num_irows + irow] = acc.second; | |
39: } | |
40: } | |
41: } | |
42: | |
43: /* Reduce the innermost dimension of a tensor (on thrust::pair functors which are (value, index)) | |
44: * | |
45: * For an n-d tensor (n <= 4) where the reduction is along the innermost dimension: | |
46: * | |
47: * - block.x is the innermost dimension, i.e. dimension 0; | |
48: * - block.y and grid.y make up dimension 1; and | |
49: * - grid.x and grid z are the remaining two outer dimensions (if any) | |
50: * | |
51: * Reduction along other dimensions is handled in a separate kernel. | |
52: */ | |
53: kernel void THClTensor_kernel_transformReduceInnermostDimIndex( | |
54: global float *tgt1_data, int tgt1_offset, | |
55: global float *tgt2_data, int tgt2_offset, | |
56: global float *src__data, int src__offset, | |
57: int num_rows, int row_size | |
58: ) { | |
59: global float *tgt1 = tgt1_data + tgt1_offset; | |
60: global float *tgt2 = tgt2_data + tgt2_offset; | |
61: global float *src_ = src__data + src__offset; | |
62: | |
63: local float sbuf[16][16]; | |
64: local float ibuf[16][16]; | |
65: | |
66: for (int block_row = get_group_id(0) * get_local_size(1); block_row < num_rows; block_row += get_local_size(1) * get_num_groups(0)) { | |
67: int row = block_row + get_local_id(1); | |
68: // thrust::pair<float,float> acc = init; | |
69: Pair acc = { .first=-3.40282e+38f, .second=-1 }; | |
70: if (row < num_rows) { | |
71: global float *src = src_ + row * row_size; | |
72: // Sequential reduction within a thread. | |
73: for (int col = get_local_id(0); col < row_size; col += get_local_size(0)) { | |
74: Pair lhs = {src[col], col+1}; | |
75: acc = binary_op(lhs, acc); | |
76: } | |
77: } | |
78: | |
79: sbuf[get_local_id(1)][get_local_id(0)] = acc.first; | |
80: ibuf[get_local_id(1)][get_local_id(0)] = acc.second; | |
81: | |
82: // Reduce intermediate values to single value. | |
83: local float* sline = &sbuf[get_local_id(1)][0]; | |
84: local float* iline = &ibuf[get_local_id(1)][0]; | |
85: for (int s = 8; s > 0; s >>= 1) { | |
86: if (row < num_rows && (int)get_local_id(0) < s) { | |
87: Pair arg1 = {.first=sline[get_local_id(0)], .second=iline[get_local_id(0)]}; | |
88: Pair arg2 = {.first=sline[get_local_id(0) + s], .second=iline[get_local_id(0) + s]}; | |
89: Pair res = binary_op(arg1, arg2); | |
90: sline[get_local_id(0)] = res.first; | |
91: iline[get_local_id(0)] = res.second; | |
92: } | |
93: barrier(CLK_LOCAL_MEM_FENCE); | |
94: } | |
95: | |
96: if (row < num_rows && get_local_id(0) == 0) { | |
97: tgt1[row] = sline[0]; | |
98: tgt2[row] = iline[0]; | |
99: } | |
100: barrier(CLK_LOCAL_MEM_FENCE); | |
101: } | |
102: } | |
103: | |
104: | |
Something went wrong with clCreateKernel, OpenCL error code -45 | |
THClTensorMathTransformReduce.cl build log: | |
input.cl:8:1: error: OpenCL does not support the 'static' storage class specifier | |
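The diagnostic here is accurate: OpenCL C 1.1, which is what this Clover platform exposes, does not allow the 'static' storage-class specifier, so 'static Pair binary_op(...)' at input.cl line 8 cannot compile. Below is a sketch of the obvious local fix, marking the helper 'inline' instead of 'static'; whether clnn fixed it this way upstream is not shown in this log.

typedef struct Pair {
    float first;    // value
    float second;   // 1-based index of that value
} Pair;

// same max-by-value reduction as input.cl:8, without the 'static' specifier
inline Pair binary_op(Pair a, Pair b) {
    if (a.first > b.first) { return a; } else { return b; }
}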
____*________*|_____________________________________________________ ==> LogSoftMax_backward_batchTHClTensorMathTransformReduce.cl build log: | |
input.cl:8:1: error: OpenCL does not support the 'static' storage class specifier | |
kernel build error: | |
kernel source: | |
1: // from lib/THC/THCTensorMathTransformReduce.cu: | |
2: | |
3: typedef struct Pair { | |
4: float first; | |
5: float second; | |
6: } Pair; | |
7: | |
8: static Pair binary_op( Pair a, Pair b ) { | |
9: if( a.first > b.first ) { return a; } else { return b; }; | |
10: } | |
11: | |
12: /* A set of reduction kernels that take in binary ops on thrust pairs (of value, index). | |
13: These are useful when you not only have to do a reduction, but you might have | |
14: to preserve the location of contention (for example min/max operations). | |
15: The structure of the kernels follows the structure of the reduction kernels. | |
16: */ | |
17: kernel void THClTensor_kernel_transformReduceOuterDimIndex( | |
18: global float *tgt1_data, int tgt1_offset, | |
19: global float *tgt2_data, int tgt2_offset, | |
20: global float *src__data, int src__offset, | |
21: int num_orows, int num_irows, int row_size | |
22: ) { | |
23: global float *tgt1 = tgt1_data + tgt1_offset; | |
24: global float *tgt2 = tgt2_data + tgt2_offset; | |
25: global float *src_ = src__data + src__offset; | |
26: | |
27: for (int orow = get_group_id(0); orow < num_orows; orow += get_num_groups(0)) { | |
28: for (int irow = get_group_id(1) * get_local_size(0) + get_local_id(0); irow < num_irows; irow += get_num_groups(1) * get_local_size(0)) { | |
29: global float *src = src_ + orow * row_size * num_irows + irow; | |
30: Pair acc = {.first=-3.40282e+38f, .second=-1}; | |
31: for (int col = 0; col < row_size; ++col) { | |
32: Pair lhs = {*src, col+1}; | |
33: acc = binary_op( lhs, acc); | |
34: // acc = binary_op(thrust::make_pair(*src, col+1), acc); // i+1 for 1-indexing | |
35: src += num_irows; | |
36: } | |
37: tgt1[orow * num_irows + irow] = acc.first; | |
38: tgt2[orow * num_irows + irow] = acc.second; | |
39: } | |
40: } | |
41: } | |
42: | |
43: /* Reduce the innermost dimension of a tensor (on thrust::pair functors which are (value, index)) | |
44: * | |
45: * For an n-d tensor (n <= 4) where the reduction is along the innermost dimension: | |
46: * | |
47: * - block.x is the innermost dimension, i.e. dimension 0; | |
48: * - block.y and grid.y make up dimension 1; and | |
49: * - grid.x and grid z are the remaining two outer dimensions (if any) | |
50: * | |
51: * Reduction along other dimensions is handled in a separate kernel. | |
52: */ | |
53: kernel void THClTensor_kernel_transformReduceInnermostDimIndex( | |
54: global float *tgt1_data, int tgt1_offset, | |
55: global float *tgt2_data, int tgt2_offset, | |
56: global float *src__data, int src__offset, | |
57: int num_rows, int row_size | |
58: ) { | |
59: global float *tgt1 = tgt1_data + tgt1_offset; | |
60: global float *tgt2 = tgt2_data + tgt2_offset; | |
61: global float *src_ = src__data + src__offset; | |
62: | |
63: local float sbuf[16][16]; | |
64: local float ibuf[16][16]; | |
65: | |
66: for (int block_row = get_group_id(0) * get_local_size(1); block_row < num_rows; block_row += get_local_size(1) * get_num_groups(0)) { | |
67: int row = block_row + get_local_id(1); | |
68: // thrust::pair<float,float> acc = init; | |
69: Pair acc = { .first=-3.40282e+38f, .second=-1 }; | |
70: if (row < num_rows) { | |
71: global float *src = src_ + row * row_size; | |
72: // Sequential reduction within a thread. | |
73: for (int col = get_local_id(0); col < row_size; col += get_local_size(0)) { | |
74: Pair lhs = {src[col], col+1}; | |
75: acc = binary_op(lhs, acc); | |
76: } | |
77: } | |
78: | |
79: sbuf[get_local_id(1)][get_local_id(0)] = acc.first; | |
80: ibuf[get_local_id(1)][get_local_id(0)] = acc.second; | |
81: | |
82: // Reduce intermediate values to single value. | |
83: local float* sline = &sbuf[get_local_id(1)][0]; | |
84: local float* iline = &ibuf[get_local_id(1)][0]; | |
85: for (int s = 8; s > 0; s >>= 1) { | |
86: if (row < num_rows && (int)get_local_id(0) < s) { | |
87: Pair arg1 = {.first=sline[get_local_id(0)], .second=iline[get_local_id(0)]}; | |
88: Pair arg2 = {.first=sline[get_local_id(0) + s], .second=iline[get_local_id(0) + s]}; | |
89: Pair res = binary_op(arg1, arg2); | |
90: sline[get_local_id(0)] = res.first; | |
91: iline[get_local_id(0)] = res.second; | |
92: } | |
93: barrier(CLK_LOCAL_MEM_FENCE); | |
94: } | |
95: | |
96: if (row < num_rows && get_local_id(0) == 0) { | |
97: tgt1[row] = sline[0]; | |
98: tgt2[row] = iline[0]; | |
99: } | |
100: barrier(CLK_LOCAL_MEM_FENCE); | |
101: } | |
102: } | |
103: | |
104: | |
Something went wrong with clCreateKernel, OpenCL error code -45 | |
THClTensorMathTransformReduce.cl build log: | |
input.cl:8:1: error: OpenCL does not support the 'static' storage class specifier | |
____*________**|____________________________________________________ ==> LogSoftMax_forwardTHClTensorMathTransformReduce.cl build log: | |
input.cl:8:1: error: OpenCL does not support the 'static' storage class specifier | |
kernel build error: | |
kernel source: | |
1: // from lib/THC/THCTensorMathTransformReduce.cu: | |
2: | |
3: typedef struct Pair { | |
4: float first; | |
5: float second; | |
6: } Pair; | |
7: | |
8: static Pair binary_op( Pair a, Pair b ) { | |
9: if( a.first > b.first ) { return a; } else { return b; }; | |
10: } | |
11: | |
12: /* A set of reduction kernels that take in binary ops on thrust pairs (of value, index). | |
13: These are useful when you not only have to do a reduction, but you might have | |
14: to preserve the location of contention (for example min/max operations). | |
15: The structure of the kernels follows the structure of the reduction kernels. | |
16: */ | |
17: kernel void THClTensor_kernel_transformReduceOuterDimIndex( | |
18: global float *tgt1_data, int tgt1_offset, | |
19: global float *tgt2_data, int tgt2_offset, | |
20: global float *src__data, int src__offset, | |
21: int num_orows, int num_irows, int row_size | |
22: ) { | |
23: global float *tgt1 = tgt1_data + tgt1_offset; | |
24: global float *tgt2 = tgt2_data + tgt2_offset; | |
25: global float *src_ = src__data + src__offset; | |
26: | |
27: for (int orow = get_group_id(0); orow < num_orows; orow += get_num_groups(0)) { | |
28: for (int irow = get_group_id(1) * get_local_size(0) + get_local_id(0); irow < num_irows; irow += get_num_groups(1) * get_local_size(0)) { | |
29: global float *src = src_ + orow * row_size * num_irows + irow; | |
30: Pair acc = {.first=-3.40282e+38f, .second=-1}; | |
31: for (int col = 0; col < row_size; ++col) { | |
32: Pair lhs = {*src, col+1}; | |
33: acc = binary_op( lhs, acc); | |
34: // acc = binary_op(thrust::make_pair(*src, col+1), acc); // i+1 for 1-indexing | |
35: src += num_irows; | |
36: } | |
37: tgt1[orow * num_irows + irow] = acc.first; | |
38: tgt2[orow * num_irows + irow] = acc.second; | |
39: } | |
40: } | |
41: } | |
42: | |
43: /* Reduce the innermost dimension of a tensor (on thrust::pair functors which are (value, index)) | |
44: * | |
45: * For an n-d tensor (n <= 4) where the reduction is along the innermost dimension: | |
46: * | |
47: * - block.x is the innermost dimension, i.e. dimension 0; | |
48: * - block.y and grid.y make up dimension 1; and | |
49: * - grid.x and grid z are the remaining two outer dimensions (if any) | |
50: * | |
51: * Reduction along other dimensions is handled in a separate kernel. | |
52: */ | |
53: kernel void THClTensor_kernel_transformReduceInnermostDimIndex( | |
54: global float *tgt1_data, int tgt1_offset, | |
55: global float *tgt2_data, int tgt2_offset, | |
56: global float *src__data, int src__offset, | |
57: int num_rows, int row_size | |
58: ) { | |
59: global float *tgt1 = tgt1_data + tgt1_offset; | |
60: global float *tgt2 = tgt2_data + tgt2_offset; | |
61: global float *src_ = src__data + src__offset; | |
62: | |
63: local float sbuf[16][16]; | |
64: local float ibuf[16][16]; | |
65: | |
66: for (int block_row = get_group_id(0) * get_local_size(1); block_row < num_rows; block_row += get_local_size(1) * get_num_groups(0)) { | |
67: int row = block_row + get_local_id(1); | |
68: // thrust::pair<float,float> acc = init; | |
69: Pair acc = { .first=-3.40282e+38f, .second=-1 }; | |
70: if (row < num_rows) { | |
71: global float *src = src_ + row * row_size; | |
72: // Sequential reduction within a thread. | |
73: for (int col = get_local_id(0); col < row_size; col += get_local_size(0)) { | |
74: Pair lhs = {src[col], col+1}; | |
75: acc = binary_op(lhs, acc); | |
76: } | |
77: } | |
78: | |
79: sbuf[get_local_id(1)][get_local_id(0)] = acc.first; | |
80: ibuf[get_local_id(1)][get_local_id(0)] = acc.second; | |
81: | |
82: // Reduce intermediate values to single value. | |
83: local float* sline = &sbuf[get_local_id(1)][0]; | |
84: local float* iline = &ibuf[get_local_id(1)][0]; | |
85: for (int s = 8; s > 0; s >>= 1) { | |
86: if (row < num_rows && (int)get_local_id(0) < s) { | |
87: Pair arg1 = {.first=sline[get_local_id(0)], .second=iline[get_local_id(0)]}; | |
88: Pair arg2 = {.first=sline[get_local_id(0) + s], .second=iline[get_local_id(0) + s]}; | |
89: Pair res = binary_op(arg1, arg2); | |
90: sline[get_local_id(0)] = res.first; | |
91: iline[get_local_id(0)] = res.second; | |
92: } | |
93: barrier(CLK_LOCAL_MEM_FENCE); | |
94: } | |
95: | |
96: if (row < num_rows && get_local_id(0) == 0) { | |
97: tgt1[row] = sline[0]; | |
98: tgt2[row] = iline[0]; | |
99: } | |
100: barrier(CLK_LOCAL_MEM_FENCE); | |
101: } | |
102: } | |
103: | |
104: | |
Something went wrong with clCreateKernel, OpenCL error code -45 | |
THClTensorMathTransformReduce.cl build log: | |
input.cl:8:1: error: OpenCL does not support the 'static' storage class specifier | |
____*________***|___________________________________________________ ==> LogSoftMax_forward_batchTHClTensorMathTransformReduce.cl build log: | |
input.cl:8:1: error: OpenCL does not support the 'static' storage class specifier | |
kernel build error: | |
kernel source: | |
1: // from lib/THC/THCTensorMathTransformReduce.cu: | |
2: | |
3: typedef struct Pair { | |
4: float first; | |
5: float second; | |
6: } Pair; | |
7: | |
8: static Pair binary_op( Pair a, Pair b ) { | |
9: if( a.first > b.first ) { return a; } else { return b; }; | |
10: } | |
11: | |
12: /* A set of reduction kernels that take in binary ops on thrust pairs (of value, index). | |
13: These are useful when you not only have to do a reduction, but you might have | |
14: to preserve the location of contention (for example min/max operations). | |
15: The structure of the kernels follows the structure of the reduction kernels. | |
16: */ | |
17: kernel void THClTensor_kernel_transformReduceOuterDimIndex( | |
18: global float *tgt1_data, int tgt1_offset, | |
19: global float *tgt2_data, int tgt2_offset, | |
20: global float *src__data, int src__offset, | |
21: int num_orows, int num_irows, int row_size | |
22: ) { | |
23: global float *tgt1 = tgt1_data + tgt1_offset; | |
24: global float *tgt2 = tgt2_data + tgt2_offset; | |
25: global float *src_ = src__data + src__offset; | |
26: | |
27: for (int orow = get_group_id(0); orow < num_orows; orow += get_num_groups(0)) { | |
28: for (int irow = get_group_id(1) * get_local_size(0) + get_local_id(0); irow < num_irows; irow += get_num_groups(1) * get_local_size(0)) { | |
29: global float *src = src_ + orow * row_size * num_irows + irow; | |
30: Pair acc = {.first=-3.40282e+38f, .second=-1}; | |
31: for (int col = 0; col < row_size; ++col) { | |
32: Pair lhs = {*src, col+1}; | |
33: acc = binary_op( lhs, acc); | |
34: // acc = binary_op(thrust::make_pair(*src, col+1), acc); // i+1 for 1-indexing | |
35: src += num_irows; | |
36: } | |
37: tgt1[orow * num_irows + irow] = acc.first; | |
38: tgt2[orow * num_irows + irow] = acc.second; | |
39: } | |
40: } | |
41: } | |
42: | |
43: /* Reduce the innermost dimension of a tensor (on thrust::pair functors which are (value, index)) | |
44: * | |
45: * For an n-d tensor (n <= 4) where the reduction is along the innermost dimension: | |
46: * | |
47: * - block.x is the innermost dimension, i.e. dimension 0; | |
48: * - block.y and grid.y make up dimension 1; and | |
49: * - grid.x and grid z are the remaining two outer dimensions (if any) | |
50: * | |
51: * Reduction along other dimensions is handled in a separate kernel. | |
52: */ | |
53: kernel void THClTensor_kernel_transformReduceInnermostDimIndex( | |
54: global float *tgt1_data, int tgt1_offset, | |
55: global float *tgt2_data, int tgt2_offset, | |
56: global float *src__data, int src__offset, | |
57: int num_rows, int row_size | |
58: ) { | |
59: global float *tgt1 = tgt1_data + tgt1_offset; | |
60: global float *tgt2 = tgt2_data + tgt2_offset; | |
61: global float *src_ = src__data + src__offset; | |
62: | |
63: local float sbuf[16][16]; | |
64: local float ibuf[16][16]; | |
65: | |
66: for (int block_row = get_group_id(0) * get_local_size(1); block_row < num_rows; block_row += get_local_size(1) * get_num_groups(0)) { | |
67: int row = block_row + get_local_id(1); | |
68: // thrust::pair<float,float> acc = init; | |
69: Pair acc = { .first=-3.40282e+38f, .second=-1 }; | |
70: if (row < num_rows) { | |
71: global float *src = src_ + row * row_size; | |
72: // Sequential reduction within a thread. | |
73: for (int col = get_local_id(0); col < row_size; col += get_local_size(0)) { | |
74: Pair lhs = {src[col], col+1}; | |
75: acc = binary_op(lhs, acc); | |
76: } | |
77: } | |
78: | |
79: sbuf[get_local_id(1)][get_local_id(0)] = acc.first; | |
80: ibuf[get_local_id(1)][get_local_id(0)] = acc.second; | |
81: | |
82: // Reduce intermediate values to single value. | |
83: local float* sline = &sbuf[get_local_id(1)][0]; | |
84: local float* iline = &ibuf[get_local_id(1)][0]; | |
85: for (int s = 8; s > 0; s >>= 1) { | |
86: if (row < num_rows && (int)get_local_id(0) < s) { | |
87: Pair arg1 = {.first=sline[get_local_id(0)], .second=iline[get_local_id(0)]}; | |
88: Pair arg2 = {.first=sline[get_local_id(0) + s], .second=iline[get_local_id(0) + s]}; | |
89: Pair res = binary_op(arg1, arg2); | |
90: sline[get_local_id(0)] = res.first; | |
91: iline[get_local_id(0)] = res.second; | |
92: } | |
93: barrier(CLK_LOCAL_MEM_FENCE); | |
94: } | |
95: | |
96: if (row < num_rows && get_local_id(0) == 0) { | |
97: tgt1[row] = sline[0]; | |
98: tgt2[row] = iline[0]; | |
99: } | |
100: barrier(CLK_LOCAL_MEM_FENCE); | |
101: } | |
102: } | |
103: | |
104: | |
Something went wrong with clCreateKernel, OpenCL error code -45 | |
THClTensorMathTransformReduce.cl build log: | |
input.cl:8:1: error: OpenCL does not support the 'static' storage class specifier | |
____*________****|__________________________________________________ ==> LookupTable_backwardnDim 97 nInput 10 batch false error 0 | |
nDim 97 nInput 10 batch true error 0 | |
nDim 97 nInput 101 batch false error 0 | |
nDim 97 nInput 101 batch true error 0 | |
nDim 255 nInput 10 batch false error 0 | |
nDim 255 nInput 10 batch true error 0 | |
nDim 255 nInput 101 batch false error 0 | |
nDim 255 nInput 101 batch true error 0 | |
____*________****______________________________________|____________ ==> Sum_backwardTHClReduce.cl build log: | |
unsupported call to function reduceBlock in THClTensor_reduceContigDim | |
kernel build error: | |
kernel source: | |
1: // Threads per thread block | |
2: #define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16 | |
3: | |
4: inline float modifyOp(float _in1) { | |
5: float _out; | |
6: float *in1 = &_in1; | |
7: float *out = &_out; | |
8: *out = *in1; | |
9: return _out; | |
10: } | |
11: | |
12: inline float reduceOp(float _in1, float _in2) { | |
13: // I guess the compiler can sort this stuff out :-P | |
14: float _out; | |
15: float *in1 = &_in1; | |
16: float *in2 = &_in2; | |
17: float *out = &_out; | |
18: *out = *in1 + *in2; | |
19: return _out; | |
20: } | |
21: | |
22: // kernel argument that defines tensor layout | |
23: typedef struct TensorInfoCl { | |
24: // Extracts size/stride information for the kernel. | |
25: // Successive dimensions can be collapsed if the size/strides match | |
26: // up and thus there are no holes between the dimensions. This is used | |
27: // to reduce the complexity of the problem. | |
28: // The optional `reduceDim` indicates a reduction dimension for the | |
29: // given tensor, so that the output size for this dimension will be 1. | |
30: | |
31: int sizes[25]; | |
32: int strides[25]; | |
33: int offset; | |
34: int dims; | |
35: } TensorInfoCl; | |
36: // Contiguous tensors of more than one dimension are collapsed down | |
37: // to one tensor | |
38: | |
39: | |
40: // Translate a linear index for the apply to a float* offset; | |
41: // specialized on `Dims` to reduce nvcc compilation time | |
42: | |
43: inline int IndexToOffset_1001_get( int linearId, global TensorInfoCl *info) { | |
44: int offset = info->offset; | |
45: | |
46: // Use static dims | |
47: // for (int i = 1 - 1; i >= 0; --i) { | |
48: int curDimIndex; | |
49: int curDimOffset; | |
50: // bake this in.... | |
51: curDimIndex = linearId % info->sizes[0]; | |
52: curDimOffset = curDimIndex * info->strides[0]; | |
53: offset += curDimOffset; | |
54: | |
55: | |
56: | |
57: // } | |
58: | |
59: return offset; | |
60: } | |
61: | |
62: | |
63: inline int IndexToOffset_998_get(int linearId, global const TensorInfoCl *info) { | |
64: return linearId + info->offset; | |
65: } | |
66: | |
67: inline int IndexToOffset_999_get(int linearId, global const TensorInfoCl *info) { | |
68: int offset = info->offset; | |
69: | |
70: // Use dynamic dims | |
71: for (int i = info->dims - 1; i >= 0; --i) { | |
72: int curDimIndex = linearId % info->sizes[i]; | |
73: int curDimOffset = curDimIndex * info->strides[i]; | |
74: offset += curDimOffset; | |
75: | |
76: linearId /= info->sizes[i]; | |
77: } | |
78: | |
79: return offset; | |
80: } | |
81: | |
82: inline int getLinearBlockId() { | |
83: return get_group_id(2) * get_num_groups(1) * get_num_groups(0) + | |
84: get_group_id(1) * get_num_groups(0) + | |
85: get_group_id(0); | |
86: } | |
87: | |
88: // Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will | |
89: // return the reduced value | |
90: | |
91: inline float reduceBlock( local float* smem, | |
92: int numVals, | |
93: float threadVal, | |
94: float init) { | |
95: if (numVals == 0) { | |
96: return init; | |
97: } | |
98: | |
99: if ((int)get_local_id(0) < numVals) { | |
100: smem[ get_local_id(0)] = threadVal; | |
101: } | |
102: | |
103: // First warp will perform reductions across warps | |
104: barrier(CLK_LOCAL_MEM_FENCE); | |
105: if ((get_local_id(0) / 32) == 0) { | |
106: float r = (int)get_local_id(0) < numVals ? smem[get_local_id(0)] : init; | |
107: | |
108: for (int i = 32 + get_local_id(0); i < numVals; i += 32) { | |
109: r = reduceOp(r, smem[i]); | |
110: } | |
111: | |
112: smem[get_local_id(0)] = r; | |
113: } | |
114: | |
115: // First thread will perform reductions across the block | |
116: barrier(CLK_LOCAL_MEM_FENCE); | |
117: | |
118: float r = init; | |
119: if (get_local_id(0) == 0) { | |
120: r = smem[0]; | |
121: | |
122: int numLanesParticipating = min(numVals, 32); | |
123: | |
124: if (numLanesParticipating == 32) { | |
125: // Unroll for 32 == 32 and numVals >= 32 | |
126: // #pragma unroll | |
127: // unrolling by hand, so compiler-independent | |
128: | |
129: r = reduceOp(r, smem[1]); | |
130: | |
131: r = reduceOp(r, smem[2]); | |
132: | |
133: r = reduceOp(r, smem[3]); | |
134: | |
135: r = reduceOp(r, smem[4]); | |
136: | |
137: r = reduceOp(r, smem[5]); | |
138: | |
139: r = reduceOp(r, smem[6]); | |
140: | |
141: r = reduceOp(r, smem[7]); | |
142: | |
143: r = reduceOp(r, smem[8]); | |
144: | |
145: r = reduceOp(r, smem[9]); | |
146: | |
147: r = reduceOp(r, smem[10]); | |
148: | |
149: r = reduceOp(r, smem[11]); | |
150: | |
151: r = reduceOp(r, smem[12]); | |
152: | |
153: r = reduceOp(r, smem[13]); | |
154: | |
155: r = reduceOp(r, smem[14]); | |
156: | |
157: r = reduceOp(r, smem[15]); | |
158: | |
159: r = reduceOp(r, smem[16]); | |
160: | |
161: r = reduceOp(r, smem[17]); | |
162: | |
163: r = reduceOp(r, smem[18]); | |
164: | |
165: r = reduceOp(r, smem[19]); | |
166: | |
167: r = reduceOp(r, smem[20]); | |
168: | |
169: r = reduceOp(r, smem[21]); | |
170: | |
171: r = reduceOp(r, smem[22]); | |
172: | |
173: r = reduceOp(r, smem[23]); | |
174: | |
175: r = reduceOp(r, smem[24]); | |
176: | |
177: r = reduceOp(r, smem[25]); | |
178: | |
179: r = reduceOp(r, smem[26]); | |
180: | |
181: r = reduceOp(r, smem[27]); | |
182: | |
183: r = reduceOp(r, smem[28]); | |
184: | |
185: r = reduceOp(r, smem[29]); | |
186: | |
187: r = reduceOp(r, smem[30]); | |
188: | |
189: r = reduceOp(r, smem[31]); | |
190: | |
191: } else { | |
192: for (int i = 1; i < numLanesParticipating; ++i) { | |
193: r = reduceOp(r, smem[i]); | |
194: } | |
195: } | |
196: } | |
197: | |
198: return r; | |
199: } | |
200: | |
201: | |
202: | |
203: | |
204: inline int getReduceNoncontigDimSliceIndex() { | |
205: // Each thread handles one slice | |
206: return getLinearBlockId() * THCL_NONCONTIG_REDUCE_BLOCK_SIZE + /*threadIdx.x*/ get_local_id(0); | |
207: } | |
208: | |
209: // Kernel that handles an entire reduction of a slice of a tensor per each thread | |
210: kernel void | |
211: THClTensor_reduceNoncontigDim(global TensorInfoCl *out_info, | |
212: global float *out_data, | |
213: global TensorInfoCl *in_info, | |
214: global float *in_data, | |
215: int reductionStride, | |
216: int reductionSize, | |
217: int totalSlices, | |
218: float init) { | |
219: const int sliceIndex = getReduceNoncontigDimSliceIndex(); | |
220: | |
221: if ((int)sliceIndex >= totalSlices) { | |
222: return; | |
223: } | |
224: | |
225: // Each thread picks a point in `out` and `in` for which it is | |
226: // producing the reduction | |
227: const int outOffset = | |
228: IndexToOffset_998_get(sliceIndex, &out_info[0]); | |
229: const int inBaseOffset = | |
230: IndexToOffset_1001_get(sliceIndex, &in_info[0]); | |
231: | |
232: // For each point in reductionSize, reduce into `r` | |
233: int inOffset = inBaseOffset; | |
234: float r = init; | |
235: | |
236: for (int i = 0; (int)i < reductionSize; ++i) { | |
237: r = reduceOp(r, modifyOp(in_data[inOffset])); | |
238: inOffset += reductionStride; | |
239: } | |
240: | |
241: // Write out reduced value | |
242: out_data[outOffset] = r; | |
243: } | |
244: | |
245: inline int getReduceContigDimSliceIndex() { | |
246: // Each block handles one slice | |
247: return getLinearBlockId(); | |
248: } | |
249: | |
250: // Kernel that handles an entire reduction of a slice of a tensor per | |
251: // each block | |
252: kernel void | |
253: THClTensor_reduceContigDim(global TensorInfoCl *out_info, | |
254: global float *out_data, | |
255: global TensorInfoCl *in_info, | |
256: global float *in_data, | |
257: int reductionSize, | |
258: int totalSlices, | |
259: float init, | |
260: local float *smem) { | |
261: const int sliceIndex = getReduceContigDimSliceIndex(); | |
262: | |
263: if ((int)sliceIndex >= totalSlices) { | |
264: return; | |
265: } | |
266: | |
267: // Get the offset in `out` for the reduction | |
268: const int outOffset = | |
269: IndexToOffset_998_get(sliceIndex, &out_info[0]); | |
270: | |
271: // Get the base offset in `in` for this block's reduction | |
272: const int inBaseOffset = | |
273: IndexToOffset_1001_get(sliceIndex, &in_info[0]); | |
274: | |
275: // Each thread in the block will reduce some subset of elements in | |
276: // the slice. The elements are guaranteed contiguous starting at | |
277: // `inBaseOffset`. | |
278: float r = init; | |
279: for (int i = /*threadIdx.x*/ get_local_id(0); (int)i < reductionSize; i += /*blockDim.x*/ get_local_size(0)) { | |
280: r = reduceOp(r, modifyOp(in_data[inBaseOffset + i])); | |
281: } | |
282: | |
283: // Reduce within the block | |
284: // extern __shared__ float smem[]; | |
285: r = reduceBlock(smem, /*blockDim.x*/ get_local_size(0), r, init); | |
286: | |
287: if (/*threadIdx.x*/ get_local_id(0) == 0) { | |
288: // Write out reduced value | |
289: out_data[outOffset] = r; | |
290: } | |
291: } | |
292: | |
293: | |
Something went wrong with clCreateKernel, OpenCL error code -45 | |
THClReduce.cl build log: | |
unsupported call to function reduceBlock in THClTensor_reduceContigDim | |
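As with the earlier THClReduceAll failure, clCreateKernel only returns -45 (CL_INVALID_PROGRAM_EXECUTABLE) because the program build itself failed; the build log printed above is the informative part. For completeness, a small host-side C sketch of how such a log is fetched with the standard API (the program and device handles are assumed to come from the caller):

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

void print_build_log(cl_program program, cl_device_id device) {
    size_t len = 0;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
    char *log = malloc(len + 1);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, len, log, NULL);
    log[len] = '\0';
    fprintf(stderr, "build log:\n%s\n", log);
    free(log);
}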
____*________****______________________________________*|___________ ==> Sum_forwardTHClReduce.cl build log: | |
unsupported call to function reduceBlock in THClTensor_reduceContigDim | |
kernel build error: | |
kernel source: | |
1: // Threads per thread block | |
2: #define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16 | |
3: | |
4: inline float modifyOp(float _in1) { | |
5: float _out; | |
6: float *in1 = &_in1; | |
7: float *out = &_out; | |
8: *out = *in1; | |
9: return _out; | |
10: } | |
11: | |
12: inline float reduceOp(float _in1, float _in2) { | |
13: // I guess the compiler can sort this stuff out :-P | |
14: float _out; | |
15: float *in1 = &_in1; | |
16: float *in2 = &_in2; | |
17: float *out = &_out; | |
18: *out = *in1 + *in2; | |
19: return _out; | |
20: } | |
21: | |
22: // kernel argument that defines tensor layout | |
23: typedef struct TensorInfoCl { | |
24: // Extracts size/stride information for the kernel. | |
25: // Successive dimensions can be collapsed if the size/strides match | |
26: // up and thus there are no holes between the dimensions. This is used | |
27: // to reduce the complexity of the problem. | |
28: // The optional `reduceDim` indicates a reduction dimension for the | |
29: // given tensor, so that the output size for this dimension will be 1. | |
30: | |
31: int sizes[25]; | |
32: int strides[25]; | |
33: int offset; | |
34: int dims; | |
35: } TensorInfoCl; | |
36: // Contiguous tensors of more than one dimension are collapsed down | |
37: // to one tensor | |
38: | |
39: | |
40: // Translate a linear index for the apply to a float* offset; | |
41: // specialized on `Dims` to reduce nvcc compilation time | |
42: | |
43: inline int IndexToOffset_1001_get( int linearId, global TensorInfoCl *info) { | |
44: int offset = info->offset; | |
45: | |
46: // Use static dims | |
47: // for (int i = 1 - 1; i >= 0; --i) { | |
48: int curDimIndex; | |
49: int curDimOffset; | |
50: // bake this in.... | |
51: curDimIndex = linearId % info->sizes[0]; | |
52: curDimOffset = curDimIndex * info->strides[0]; | |
53: offset += curDimOffset; | |
54: | |
55: | |
56: | |
57: // } | |
58: | |
59: return offset; | |
60: } | |
61: | |
62: | |
63: inline int IndexToOffset_998_get(int linearId, global const TensorInfoCl *info) { | |
64: return linearId + info->offset; | |
65: } | |
66: | |
67: inline int IndexToOffset_999_get(int linearId, global const TensorInfoCl *info) { | |
68: int offset = info->offset; | |
69: | |
70: // Use dynamic dims | |
71: for (int i = info->dims - 1; i >= 0; --i) { | |
72: int curDimIndex = linearId % info->sizes[i]; | |
73: int curDimOffset = curDimIndex * info->strides[i]; | |
74: offset += curDimOffset; | |
75: | |
76: linearId /= info->sizes[i]; | |
77: } | |
78: | |
79: return offset; | |
80: } | |
81: | |
82: inline int getLinearBlockId() { | |
83: return get_group_id(2) * get_num_groups(1) * get_num_groups(0) + | |
84: get_group_id(1) * get_num_groups(0) + | |
85: get_group_id(0); | |
86: } | |
87: | |
88: // Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will | |
89: // return the reduced value | |
90: | |
91: inline float reduceBlock( local float* smem, | |
92: int numVals, | |
93: float threadVal, | |
94: float init) { | |
95: if (numVals == 0) { | |
96: return init; | |
97: } | |
98: | |
99: if ((int)get_local_id(0) < numVals) { | |
100: smem[ get_local_id(0)] = threadVal; | |
101: } | |
102: | |
103: // First warp will perform reductions across warps | |
104: barrier(CLK_LOCAL_MEM_FENCE); | |
105: if ((get_local_id(0) / 32) == 0) { | |
106: float r = (int)get_local_id(0) < numVals ? smem[get_local_id(0)] : init; | |
107: | |
108: for (int i = 32 + get_local_id(0); i < numVals; i += 32) { | |
109: r = reduceOp(r, smem[i]); | |
110: } | |
111: | |
112: smem[get_local_id(0)] = r; | |
113: } | |
114: | |
115: // First thread will perform reductions across the block | |
116: barrier(CLK_LOCAL_MEM_FENCE); | |
117: | |
118: float r = init; | |
119: if (get_local_id(0) == 0) { | |
120: r = smem[0]; | |
121: | |
122: int numLanesParticipating = min(numVals, 32); | |
123: | |
124: if (numLanesParticipating == 32) { | |
125: // Unroll for 32 == 32 and numVals >= 32 | |
126: // #pragma unroll | |
127: // unrolling by hand, so compiler-independent | |
128: | |
129: r = reduceOp(r, smem[1]); | |
130: | |
131: r = reduceOp(r, smem[2]); | |
132: | |
133: r = reduceOp(r, smem[3]); | |
134: | |
135: r = reduceOp(r, smem[4]); | |
136: | |
137: r = reduceOp(r, smem[5]); | |
138: | |
139: r = reduceOp(r, smem[6]); | |
140: | |
141: r = reduceOp(r, smem[7]); | |
142: | |
143: r = reduceOp(r, smem[8]); | |
144: | |
145: r = reduceOp(r, smem[9]); | |
146: | |
147: r = reduceOp(r, smem[10]); | |
148: | |
149: r = reduceOp(r, smem[11]); | |
150: | |
151: r = reduceOp(r, smem[12]); | |
152: | |
153: r = reduceOp(r, smem[13]); | |
154: | |
155: r = reduceOp(r, smem[14]); | |
156: | |
157: r = reduceOp(r, smem[15]); | |
158: | |
159: r = reduceOp(r, smem[16]); | |
160: | |
161: r = reduceOp(r, smem[17]); | |
162: | |
163: r = reduceOp(r, smem[18]); | |
164: | |
165: r = reduceOp(r, smem[19]); | |
166: | |
167: r = reduceOp(r, smem[20]); | |
168: | |
169: r = reduceOp(r, smem[21]); | |
170: | |
171: r = reduceOp(r, smem[22]); | |
172: | |
173: r = reduceOp(r, smem[23]); | |
174: | |
175: r = reduceOp(r, smem[24]); | |
176: | |
177: r = reduceOp(r, smem[25]); | |
178: | |
179: r = reduceOp(r, smem[26]); | |
180: | |
181: r = reduceOp(r, smem[27]); | |
182: | |
183: r = reduceOp(r, smem[28]); | |
184: | |
185: r = reduceOp(r, smem[29]); | |
186: | |
187: r = reduceOp(r, smem[30]); | |
188: | |
189: r = reduceOp(r, smem[31]); | |
190: | |
191: } else { | |
192: for (int i = 1; i < numLanesParticipating; ++i) { | |
193: r = reduceOp(r, smem[i]); | |
194: } | |
195: } | |
196: } | |
197: | |
198: return r; | |
199: } | |
200: | |
201: | |
202: | |
203: | |
204: inline int getReduceNoncontigDimSliceIndex() { | |
205: // Each thread handles one slice | |
206: return getLinearBlockId() * THCL_NONCONTIG_REDUCE_BLOCK_SIZE + /*threadIdx.x*/ get_local_id(0); | |
207: } | |
208: | |
209: // Kernel that handles an entire reduction of a slice of a tensor per each thread | |
210: kernel void | |
211: THClTensor_reduceNoncontigDim(global TensorInfoCl *out_info, | |
212: global float *out_data, | |
213: global TensorInfoCl *in_info, | |
214: global float *in_data, | |
215: int reductionStride, | |
216: int reductionSize, | |
217: int totalSlices, | |
218: float init) { | |
219: const int sliceIndex = getReduceNoncontigDimSliceIndex(); | |
220: | |
221: if ((int)sliceIndex >= totalSlices) { | |
222: return; | |
223: } | |
224: | |
225: // Each thread picks a point in `out` and `in` for which it is | |
226: // producing the reduction | |
227: const int outOffset = | |
228: IndexToOffset_998_get(sliceIndex, &out_info[0]); | |
229: const int inBaseOffset = | |
230: IndexToOffset_1001_get(sliceIndex, &in_info[0]); | |
231: | |
232: // For each point in reductionSize, reduce into `r` | |
233: int inOffset = inBaseOffset; | |
234: float r = init; | |
235: | |
236: for (int i = 0; (int)i < reductionSize; ++i) { | |
237: r = reduceOp(r, modifyOp(in_data[inOffset])); | |
238: inOffset += reductionStride; | |
239: } | |
240: | |
241: // Write out reduced value | |
242: out_data[outOffset] = r; | |
243: } | |
244: | |
245: inline int getReduceContigDimSliceIndex() { | |
246: // Each block handles one slice | |
247: return getLinearBlockId(); | |
248: } | |
249: | |
250: // Kernel that handles an entire reduction of a slice of a tensor per | |
251: // each block | |
252: kernel void | |
253: THClTensor_reduceContigDim(global TensorInfoCl *out_info, | |
254: global float *out_data, | |
255: global TensorInfoCl *in_info, | |
256: global float *in_data, | |
257: int reductionSize, | |
258: int totalSlices, | |
259: float init, | |
260: local float *smem) { | |
261: const int sliceIndex = getReduceContigDimSliceIndex(); | |
262: | |
263: if ((int)sliceIndex >= totalSlices) { | |
264: return; | |
265: } | |
266: | |
267: // Get the offset in `out` for the reduction | |
268: const int outOffset = | |
269: IndexToOffset_998_get(sliceIndex, &out_info[0]); | |
270: | |
271: // Get the base offset in `in` for this block's reduction | |
272: const int inBaseOffset = | |
273: IndexToOffset_1001_get(sliceIndex, &in_info[0]); | |
274: | |
275: // Each thread in the block will reduce some subset of elements in | |
276: // the slice. The elements are guaranteed contiguous starting at | |
277: // `inBaseOffset`. | |
278: float r = init; | |
279: for (int i = /*threadIdx.x*/ get_local_id(0); (int)i < reductionSize; i += /*blockDim.x*/ get_local_size(0)) { | |
280: r = reduceOp(r, modifyOp(in_data[inBaseOffset + i])); | |
281: } | |
282: | |
283: // Reduce within the block | |
284: // extern __shared__ float smem[]; | |
285: r = reduceBlock(smem, /*blockDim.x*/ get_local_size(0), r, init); | |
286: | |
287: if (/*threadIdx.x*/ get_local_id(0) == 0) { | |
288: // Write out reduced value | |
289: out_data[outOffset] = r; | |
290: } | |
291: } | |
292: | |
293: | |
Something went wrong with clCreateKernel, OpenCL error code -45 | |
THClReduce.cl build log: | |
unsupported call to function reduceBlock in THClTensor_reduceContigDim | |
____*________****______________________________________**|__________ ==> Tanh_backwardApply_2t_0s_0pt_-2_-2_*out = tanh( *in1 ) build log: | |
input.cl:35:12: warning: implicit declaration of function 'tanh' is invalid in C99 | |
unsupported call to function tanh in THClTensor_pointwiseApplyD | |
kernel build error: | |
kernel source: | |
1: // OpenCL kernels.... | |
2: | |
3: // expected templated values: | |
4: // dims (vector of unique dimension values) | |
5: // operation | |
6: // dim1 | |
7: // dim2 | |
8: // dim3 | |
9: // ... dimD | |
10: // num_input_tensors | |
11: // include_scalar_input | |
12: // | |
13: // maybe should add: | |
14: // IndexType (hardcoded to int for now) | |
15: // MAX_CUTORCH_DIMS (hardcoded to 25 for now) | |
16: | |
17: // (Ported from cutorch's THCApply.cuh) | |
18: | |
19: // Maximum number of dimensions allowed for cutorch | |
20: // #define MAX_CUTORCH_DIMS 25 | |
21: | |
22: // Enum that indicates whether tensor arguments are read/write or | |
23: // read-only | |
24: //enum TensorArgType { ReadWrite, ReadOnly }; | |
25: | |
26: | |
27: | |
28: inline void op( global float *out | |
29: | |
30: , global float *in1 | |
31: | |
32: | |
33: | |
34: ) { | |
35: *out = tanh( *in1 ); | |
36: } | |
37: | |
38: kernel void | |
39: THClTensor_pointwiseApplyD( | |
40: | |
41: int offset_1, | |
42: | |
43: | |
44: global float*data_1, | |
45: | |
46: int offset_2, | |
47: | |
48: | |
49: global float*data_2, | |
50: | |
51: | |
52: | |
53: int totalElements) { | |
54: int linearIndex = get_global_id(0); | |
55: if(linearIndex < totalElements ) { | |
56: | |
57: | |
58: | |
59: | |
60: int derived_offset_1 = linearIndex + offset_1; | |
61: | |
62: | |
63: | |
64: | |
65: int derived_offset_2 = linearIndex + offset_2; | |
66: | |
67: | |
68: | |
69: op( | |
70: | |
71: | |
72: &(data_1[derived_offset_1]) | |
73: | |
74: , | |
75: &(data_2[derived_offset_2]) | |
76: | |
77: | |
78: | |
79: | |
80: | |
81: ); | |
82: } | |
83: } | |
84: | |
85: | |
Something went wrong with clCreateKernel, OpenCL error code -45 | |
Apply_2t_0s_0pt_-2_-2_*out = tanh( *in1 ) build log: | |
input.cl:35:12: warning: implicit declaration of function 'tanh' is invalid in C99 | |
unsupported call to function tanh in THClTensor_pointwiseApplyD | |
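tanh is a core built-in of OpenCL C 1.1, so the 'implicit declaration' warning suggests this Clover/libclc build simply does not provide it for the device, and the generated op() then trips the same 'unsupported call' path as reduceBlock. A hypothetical workaround sketch in OpenCL C: define tanh explicitly in terms of exp, in a form that stays finite for large inputs, and have the generated op() call that instead of the builtin (the name tanh_cl and the idea of patching the op string are illustration only, not part of clnn).

// tanh(x) = 1 - 2 / (exp(2x) + 1); exp overflow yields 1.0f, underflow -1.0f
inline float tanh_cl(float x) {
    return 1.0f - 2.0f / (exp(2.0f * x) + 1.0f);
}

inline void op(global float *out, global float *in1) {
    *out = tanh_cl(*in1);
}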
____*________****______________________________________***|_________ ==> Tanh_forwardApply_2t_0s_0pt_-2_-2_*out = tanh( *in1 ) build log: | |
input.cl:35:12: warning: implicit declaration of function 'tanh' is invalid in C99 | |
unsupported call to function tanh in THClTensor_pointwiseApplyD | |
kernel build error: | |
kernel source: | |
1: // OpenCL kernels.... | |
2: | |
3: // expected templated values: | |
4: // dims (vector of unique dimension values) | |
5: // operation | |
6: // dim1 | |
7: // dim2 | |
8: // dim3 | |
9: // ... dimD | |
10: // num_input_tensors | |
11: // include_scalar_input | |
12: // | |
13: // maybe should add: | |
14: // IndexType (hardcoded to int for now) | |
15: // MAX_CUTORCH_DIMS (hardcoded to 25 for now) | |
16: | |
17: // (Ported from cutorch's THCApply.cuh) | |
18: | |
19: // Maximum number of dimensions allowed for cutorch | |
20: // #define MAX_CUTORCH_DIMS 25 | |
21: | |
22: // Enum that indicates whether tensor arguments are read/write or | |
23: // read-only | |
24: //enum TensorArgType { ReadWrite, ReadOnly }; | |
25: | |
26: | |
27: | |
28: inline void op( global float *out | |
29: | |
30: , global float *in1 | |
31: | |
32: | |
33: | |
34: ) { | |
35: *out = tanh( *in1 ); | |
36: } | |
37: | |
38: kernel void | |
39: THClTensor_pointwiseApplyD( | |
40: | |
41: int offset_1, | |
42: | |
43: | |
44: global float*data_1, | |
45: | |
46: int offset_2, | |
47: | |
48: | |
49: global float*data_2, | |
50: | |
51: | |
52: | |
53: int totalElements) { | |
54: int linearIndex = get_global_id(0); | |
55: if(linearIndex < totalElements ) { | |
56: | |
57: | |
58: | |
59: | |
60: int derived_offset_1 = linearIndex + offset_1; | |
61: | |
62: | |
63: | |
64: | |
65: int derived_offset_2 = linearIndex + offset_2; | |
66: | |
67: | |
68: | |
69: op( | |
70: | |
71: | |
72: &(data_1[derived_offset_1]) | |
73: | |
74: , | |
75: &(data_2[derived_offset_2]) | |
76: | |
77: | |
78: | |
79: | |
80: | |
81: ); | |
82: } | |
83: } | |
84: | |
85: | |
Something went wrong with clCreateKernel, OpenCL error code -45 | |
Apply_2t_0s_0pt_-2_-2_*out = tanh( *in1 ) build log: | |
input.cl:35:12: warning: implicit declaration of function 'tanh' is invalid in C99 | |
unsupported call to function tanh in THClTensor_pointwiseApplyD | |
____*________****______________________________________****|________ ==> Tanh_transposedApply_2t_0s_0pt_-2_2_*out = tanh( *in1 ) build log: | |
input.cl:35:12: warning: implicit declaration of function 'tanh' is invalid in C99 | |
unsupported call to function tanh in THClTensor_pointwiseApplyD | |
kernel build error: | |
kernel source: | |
1: // OpenCL kernels.... | |
2: | |
3: // expected templated values: | |
4: // dims (vector of unique dimension values) | |
5: // operation | |
6: // dim1 | |
7: // dim2 | |
8: // dim3 | |
9: // ... dimD | |
10: // num_input_tensors | |
11: // include_scalar_input | |
12: // | |
13: // maybe should add: | |
14: // IndexType (hardcoded to int for now) | |
15: // MAX_CUTORCH_DIMS (hardcoded to 25 for now) | |
16: | |
17: // (Ported from cutorch's THCApply.cuh) | |
18: | |
19: // Maximum number of dimensions allowed for cutorch | |
20: // #define MAX_CUTORCH_DIMS 25 | |
21: | |
22: // Enum that indicates whether tensor arguments are read/write or | |
23: // read-only | |
24: //enum TensorArgType { ReadWrite, ReadOnly }; | |
25: | |
26: | |
27: | |
28: inline void op( global float *out | |
29: | |
30: , global float *in1 | |
31: | |
32: | |
33: | |
34: ) { | |
35: *out = tanh( *in1 ); | |
36: } | |
37: | |
38: kernel void | |
39: THClTensor_pointwiseApplyD( | |
40: | |
41: int offset_1, | |
42: | |
43: | |
44: global float*data_1, | |
45: | |
46: int offset_2, | |
47: | |
48: | |
49: int size_2_1, | |
50: int stride_2_1, | |
51: | |
52: int size_2_2, | |
53: int stride_2_2, | |
54: | |
55: global float*data_2, | |
56: | |
57: | |
58: | |
59: int totalElements) { | |
60: int linearIndex = get_global_id(0); | |
61: if(linearIndex < totalElements ) { | |
62: | |
63: int thisLinearId; | |
64: | |
65: | |
66: | |
67: | |
68: int derived_offset_1 = linearIndex + offset_1; | |
69: | |
70: | |
71: | |
72: | |
73: unsigned int derived_offset_2 = offset_2; | |
74: thisLinearId = linearIndex; | |
75: // bake this in.... | |
76: derived_offset_2 += (thisLinearId % size_2_2) * stride_2_2; | |
77: | |
78: thisLinearId /= size_2_2; | |
79: | |
80: // bake this in.... | |
81: derived_offset_2 += (thisLinearId % size_2_1) * stride_2_1; | |
82: | |
83: thisLinearId /= size_2_1; | |
84: | |
85: | |
86: | |
87: | |
88: | |
89: | |
90: op( | |
91: | |
92: | |
93: &(data_1[derived_offset_1]) | |
94: | |
95: , | |
96: &(data_2[derived_offset_2]) | |
97: | |
98: | |
99: | |
100: | |
101: | |
102: ); | |
103: } | |
104: } | |
105: | |
106: | |
Something went wrong with clCreateKernel, OpenCL error code -45 | |
Apply_2t_0s_0pt_-2_2_*out = tanh( *in1 ) build log: | |
input.cl:35:12: warning: implicit declaration of function 'tanh' is invalid in C99 | |
unsupported call to function tanh in THClTensor_pointwiseApplyD | |
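Aside from hitting the same tanh build failure, the transposed variant differs from the contiguous one only in how derived_offset_2 is computed: instead of linearIndex + offset_2, the linear index is decomposed against the baked-in size/stride pairs, innermost dimension first (lines 73-83 of the dump above). A small worked example with illustrative numbers of my own choosing (a 3x4 view over a contiguous 4x3 buffer, i.e. size_2_1 = 3, size_2_2 = 4, stride_2_1 = 1, stride_2_2 = 3):

/* Same decomposition as the dumped kernel, pulled out for a worked example.
   The concrete values are illustrative, not taken from the failing test. */
unsigned int derived_offset_2 = 0;            /* offset_2 */
int thisLinearId = 7;                         /* linearIndex */
derived_offset_2 += (thisLinearId % 4) * 3;   /* inner dim: 7 % 4 = 3, 3 * 3 = 9 */
thisLinearId /= 4;                            /* 7 / 4 = 1 */
derived_offset_2 += (thisLinearId % 3) * 1;   /* outer dim: 1 % 3 = 1, 1 * 1 = 1 */
/* derived_offset_2 == 10: element (1,3) of the 3x4 view sits at storage
   offset 3*3 + 1 = 10 in the underlying contiguous 4x3 buffer. */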
____*________****______________________________________*****_____|__ ==> mse | |
THClReduceAll.cl build log: | |
unsupported call to function reduceBlock in THClTensor_reduceAll | |
kernel build error: | |
kernel source: | |
1: inline unsigned int THClCeilDiv(unsigned int a, unsigned int b) { | |
2: return (a + b - 1) / b; | |
3: } | |
4: | |
5: | |
6: | |
7: inline float modifyOp(float _in1) { | |
8: float _out; | |
9: float *in1 = &_in1; | |
10: float *out = &_out; | |
11: *out = *in1; | |
12: return _out; | |
13: } | |
14: | |
15: inline float reduceOp(float _in1, float _in2) { | |
16: // I guess the compiler can sort this stuff out :-P | |
17: float _out; | |
18: float *in1 = &_in1; | |
19: float *in2 = &_in2; | |
20: float *out = &_out; | |
21: *out = *in1 + *in2; | |
22: return _out; | |
23: } | |
24: | |
25: // kernel argument that defines tensor layout | |
26: typedef struct TensorInfoCl { | |
27: // Extracts size/stride information for the kernel. | |
28: // Successive dimensions can be collapsed if the size/strides match | |
29: // up and thus there are no holes between the dimensions. This is used | |
30: // to reduce the complexity of the problem. | |
31: // The optional `reduceDim` indicates a reduction dimension for the | |
32: // given tensor, so that the output size for this dimension will be 1. | |
33: | |
34: unsigned int sizes[25]; | |
35: unsigned int strides[25]; | |
36: unsigned int offset; | |
37: int dims; | |
38: } TensorInfoCl; | |
39: // Contiguous tensors of more than one dimension are collapsed down | |
40: // to one tensor | |
41: | |
42: | |
43: // Translate a linear index for the apply to a float* offset; | |
44: // specialized on `Dims` to reduce nvcc compilation time | |
45: | |
46: | |
47: inline unsigned int IndexToOffset_998_get(unsigned int linearId, global const TensorInfoCl *info) { | |
48: return linearId + info->offset; | |
49: } | |
50: | |
51: inline unsigned int IndexToOffset_999_get(unsigned int linearId, global const TensorInfoCl *info) { | |
52: unsigned int offset = info->offset; | |
53: | |
54: // Use dynamic dims | |
55: for (int i = info->dims - 1; i >= 0; --i) { | |
56: unsigned int curDimIndex = linearId % info->sizes[i]; | |
57: unsigned int curDimOffset = curDimIndex * info->strides[i]; | |
58: offset += curDimOffset; | |
59: | |
60: linearId /= info->sizes[i]; | |
61: } | |
62: | |
63: return offset; | |
64: } | |
65: | |
66: inline unsigned int getLinearBlockId() { | |
67: return get_group_id(2) * get_num_groups(1) * get_num_groups(0) + | |
68: get_group_id(1) * get_num_groups(0) + | |
69: get_group_id(0); | |
70: } | |
71: | |
72: // Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will | |
73: // return the reduced value | |
74: | |
75: inline float reduceBlock( local float* smem, | |
76: int numVals, | |
77: float threadVal, | |
78: float init) { | |
79: if (numVals == 0) { | |
80: return init; | |
81: } | |
82: | |
83: if ((int)get_local_id(0) < numVals) { | |
84: smem[ get_local_id(0)] = threadVal; | |
85: } | |
86: | |
87: // First warp will perform reductions across warps | |
88: barrier(CLK_LOCAL_MEM_FENCE); | |
89: if ((get_local_id(0) / 32) == 0) { | |
90: float r = (int)get_local_id(0) < numVals ? smem[get_local_id(0)] : init; | |
91: | |
92: for (int i = 32 + get_local_id(0); i < numVals; i += 32) { | |
93: r = reduceOp(r, smem[i]); | |
94: } | |
95: | |
96: smem[get_local_id(0)] = r; | |
97: } | |
98: | |
99: // First thread will perform reductions across the block | |
100: barrier(CLK_LOCAL_MEM_FENCE); | |
101: | |
102: float r = init; | |
103: if (get_local_id(0) == 0) { | |
104: r = smem[0]; | |
105: | |
106: int numLanesParticipating = min(numVals, 32); | |
107: | |
108: if (numLanesParticipating == 32) { | |
109: // Unroll for 32 == 32 and numVals >= 32 | |
110: // #pragma unroll | |
111: // unrolling by hand, so compiler-independent | |
112: | |
113: r = reduceOp(r, smem[1]); | |
114: | |
115: r = reduceOp(r, smem[2]); | |
116: | |
117: r = reduceOp(r, smem[3]); | |
118: | |
119: r = reduceOp(r, smem[4]); | |
120: | |
121: r = reduceOp(r, smem[5]); | |
122: | |
123: r = reduceOp(r, smem[6]); | |
124: | |
125: r = reduceOp(r, smem[7]); | |
126: | |
127: r = reduceOp(r, smem[8]); | |
128: | |
129: r = reduceOp(r, smem[9]); | |
130: | |
131: r = reduceOp(r, smem[10]); | |
132: | |
133: r = reduceOp(r, smem[11]); | |
134: | |
135: r = reduceOp(r, smem[12]); | |
136: | |
137: r = reduceOp(r, smem[13]); | |
138: | |
139: r = reduceOp(r, smem[14]); | |
140: | |
141: r = reduceOp(r, smem[15]); | |
142: | |
143: r = reduceOp(r, smem[16]); | |
144: | |
145: r = reduceOp(r, smem[17]); | |
146: | |
147: r = reduceOp(r, smem[18]); | |
148: | |
149: r = reduceOp(r, smem[19]); | |
150: | |
151: r = reduceOp(r, smem[20]); | |
152: | |
153: r = reduceOp(r, smem[21]); | |
154: | |
155: r = reduceOp(r, smem[22]); | |
156: | |
157: r = reduceOp(r, smem[23]); | |
158: | |
159: r = reduceOp(r, smem[24]); | |
160: | |
161: r = reduceOp(r, smem[25]); | |
162: | |
163: r = reduceOp(r, smem[26]); | |
164: | |
165: r = reduceOp(r, smem[27]); | |
166: | |
167: r = reduceOp(r, smem[28]); | |
168: | |
169: r = reduceOp(r, smem[29]); | |
170: | |
171: r = reduceOp(r, smem[30]); | |
172: | |
173: r = reduceOp(r, smem[31]); | |
174: | |
175: } else { | |
176: for (int i = 1; i < numLanesParticipating; ++i) { | |
177: r = reduceOp(r, smem[i]); | |
178: } | |
179: } | |
180: } | |
181: | |
182: return r; | |
183: } | |
184: | |
185: | |
186: | |
187: | |
188: // Kernel that handles an entire reduction of a tensor in one pass | |
189: kernel void | |
190: THClTensor_reduceAll(global TensorInfoCl *in_info, | |
191: global float *in_data, | |
192: unsigned int totalElements, | |
193: float init, | |
194: global float* out, | |
195: local float *smem) { | |
196: // With a block-wide stride, have each thread perform its own reduction. | |
197: float r = init; | |
198: for (unsigned int i = get_local_id(0); i < totalElements; i += get_local_size(0)) { | |
199: const unsigned int inOffset = IndexToOffset_998_get(i, &in_info[0]); | |
200: r = reduceOp(r, modifyOp(in_data[inOffset])); | |
201: } | |
202: | |
203: // Reduce within the block | |
204: r = reduceBlock(smem, get_local_size(0), r, init); | |
205: | |
206: if(get_local_id(0) == 0) { | |
207: // Write out reduced value | |
208: out[0] = r; | |
209: } | |
210: } | |
211: | |
212: inline unsigned int getStartIndex(unsigned int totalSize) { | |
213: unsigned int sizePerBlock = THClCeilDiv(totalSize, (unsigned int) get_num_groups(0)); | |
214: return get_group_id(0) * sizePerBlock; | |
215: } | |
216: | |
217: inline unsigned int getEndIndex(unsigned int totalSize) { | |
218: unsigned int sizePerBlock = THClCeilDiv(totalSize, (unsigned int) get_num_groups(0)); | |
219: return min((unsigned int) ((get_group_id(0) + 1) * sizePerBlock), totalSize); | |
220: } | |
221: | |
222: // Kernel that handles an entire reduction of a tensor in two passes | |
223: kernel void | |
224: THClTensor_reduceAllPass1(global TensorInfoCl *in_info, | |
225: global float *in_data, | |
226: unsigned int totalElements, | |
227: float init, | |
228: global float* scratchSpace, | |
229: local float *smem) { | |
230: const unsigned int startIndex = getStartIndex(totalElements); | |
231: const unsigned int endIndex = getEndIndex(totalElements); | |
232: | |
233: // With a block-wide stride, have each thread perform its own reduction. | |
234: float r = init; | |
235: for (unsigned int i = startIndex + get_local_id(0); i < endIndex; i += get_local_size(0)) { | |
236: const unsigned int inOffset = IndexToOffset_998_get(i, &in_info[0]); | |
237: r = reduceOp(r, modifyOp(in_data[inOffset])); | |
238: } | |
239: | |
240: // Reduce within the block | |
241: r = reduceBlock(smem, get_local_size(0), r, init); | |
242: | |
243: if ((int)get_local_id(0) == 0) { | |
244: // Write out block-wide reduced value | |
245: scratchSpace[get_group_id(0)] = r; | |
246: } | |
247: } | |
248: | |
249: kernel void THClTensor_reduceAllPass2(int numPass1Blocks, | |
250: float init, | |
251: global float* scratchSpace, | |
252: global float* out, | |
253: local float *smem) { | |
254: float r = init; | |
255: if ((int)get_local_id(0) < numPass1Blocks) { | |
256: r = scratchSpace[get_local_id(0)]; | |
257: } | |
258: | |
259: // Reduce within the block | |
260: r = reduceBlock(smem, numPass1Blocks, r, init); | |
261: | |
262: if((int)get_local_id(0) == 0) { | |
263: out[0] = r; | |
264: } | |
265: } | |
266: | |
267: | |
268: | |
269: | |
Something went wrong with clCreateKernel, OpenCL error code -45 | |
THClReduceAll.cl build log: | |
unsupported call to function reduceBlock in THClTensor_reduceAll | |
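Same error code (-45, CL_INVALID_PROGRAM_EXECUTABLE), but this time the build log blames the call to the inline helper reduceBlock() from THClTensor_reduceAll rather than a math built-in. Stripped to its essentials, the shape Clover is objecting to looks like the sketch below (an inline function taking a local-memory pointer, called from a kernel); this is only a guess at the trigger based on the "unsupported call" wording, not a confirmed Clover bug, and the code is mine, not cltorch's:

// Minimal analogue of reduceBlock()/THClTensor_reduceAll (sketch, not cltorch code).
inline float sum_block(local float *smem, int numVals, float threadVal) {
  if ((int)get_local_id(0) < numVals) {
    smem[get_local_id(0)] = threadVal;
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  float r = 0.0f;
  if (get_local_id(0) == 0) {
    for (int i = 0; i < numVals; ++i) {
      r += smem[i];                       // plain add stands in for reduceOp()
    }
  }
  return r;
}

kernel void reduce_all_sum(global const float *in, unsigned int totalElements,
                           global float *out, local float *smem) {
  float r = 0.0f;
  for (unsigned int i = get_local_id(0); i < totalElements; i += get_local_size(0)) {
    r += in[i];
  }
  r = sum_block(smem, (int)get_local_size(0), r);
  if (get_local_id(0) == 0) {
    out[0] = r;
  }
}

If the helper call really is what Clover cannot lower, folding the body of reduceBlock() directly into each kernel would be the blunt workaround; that remains speculation.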
____*________****______________________________________*****_____*|_ ==> mse_nosizeaverage | |
THClReduceAll.cl build log: | |
unsupported call to function reduceBlock in THClTensor_reduceAll | |
kernel build error: | |
kernel source: | |
1: inline unsigned int THClCeilDiv(unsigned int a, unsigned int b) { | |
2: return (a + b - 1) / b; | |
3: } | |
4: | |
5: | |
6: | |
7: inline float modifyOp(float _in1) { | |
8: float _out; | |
9: float *in1 = &_in1; | |
10: float *out = &_out; | |
11: *out = *in1; | |
12: return _out; | |
13: } | |
14: | |
15: inline float reduceOp(float _in1, float _in2) { | |
16: // I guess the compiler can sort this stuff out :-P | |
17: float _out; | |
18: float *in1 = &_in1; | |
19: float *in2 = &_in2; | |
20: float *out = &_out; | |
21: *out = *in1 + *in2; | |
22: return _out; | |
23: } | |
24: | |
25: // kernel argument that defines tensor layout | |
26: typedef struct TensorInfoCl { | |
27: // Extracts size/stride information for the kernel. | |
28: // Successive dimensions can be collapsed if the size/strides match | |
29: // up and thus there are no holes between the dimensions. This is used | |
30: // to reduce the complexity of the problem. | |
31: // The optional `reduceDim` indicates a reduction dimension for the | |
32: // given tensor, so that the output size for this dimension will be 1. | |
33: | |
34: unsigned int sizes[25]; | |
35: unsigned int strides[25]; | |
36: unsigned int offset; | |
37: int dims; | |
38: } TensorInfoCl; | |
39: // Contiguous tensors of more than one dimension are collapsed down | |
40: // to one tensor | |
41: | |
42: | |
43: // Translate a linear index for the apply to a float* offset; | |
44: // specialized on `Dims` to reduce nvcc compilation time | |
45: | |
46: | |
47: inline unsigned int IndexToOffset_998_get(unsigned int linearId, global const TensorInfoCl *info) { | |
48: return linearId + info->offset; | |
49: } | |
50: | |
51: inline unsigned int IndexToOffset_999_get(unsigned int linearId, global const TensorInfoCl *info) { | |
52: unsigned int offset = info->offset; | |
53: | |
54: // Use dynamic dims | |
55: for (int i = info->dims - 1; i >= 0; --i) { | |
56: unsigned int curDimIndex = linearId % info->sizes[i]; | |
57: unsigned int curDimOffset = curDimIndex * info->strides[i]; | |
58: offset += curDimOffset; | |
59: | |
60: linearId /= info->sizes[i]; | |
61: } | |
62: | |
63: return offset; | |
64: } | |
65: | |
66: inline unsigned int getLinearBlockId() { | |
67: return get_group_id(2) * get_num_groups(1) * get_num_groups(0) + | |
68: get_group_id(1) * get_num_groups(0) + | |
69: get_group_id(0); | |
70: } | |
71: | |
72: // Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will | |
73: // return the reduced value | |
74: | |
75: inline float reduceBlock( local float* smem, | |
76: int numVals, | |
77: float threadVal, | |
78: float init) { | |
79: if (numVals == 0) { | |
80: return init; | |
81: } | |
82: | |
83: if ((int)get_local_id(0) < numVals) { | |
84: smem[ get_local_id(0)] = threadVal; | |
85: } | |
86: | |
87: // First warp will perform reductions across warps | |
88: barrier(CLK_LOCAL_MEM_FENCE); | |
89: if ((get_local_id(0) / 32) == 0) { | |
90: float r = (int)get_local_id(0) < numVals ? smem[get_local_id(0)] : init; | |
91: | |
92: for (int i = 32 + get_local_id(0); i < numVals; i += 32) { | |
93: r = reduceOp(r, smem[i]); | |
94: } | |
95: | |
96: smem[get_local_id(0)] = r; | |
97: } | |
98: | |
99: // First thread will perform reductions across the block | |
100: barrier(CLK_LOCAL_MEM_FENCE); | |
101: | |
102: float r = init; | |
103: if (get_local_id(0) == 0) { | |
104: r = smem[0]; | |
105: | |
106: int numLanesParticipating = min(numVals, 32); | |
107: | |
108: if (numLanesParticipating == 32) { | |
109: // Unroll for 32 == 32 and numVals >= 32 | |
110: // #pragma unroll | |
111: // unrolling by hand, so compiler-independent | |
112: | |
113: r = reduceOp(r, smem[1]); | |
114: | |
115: r = reduceOp(r, smem[2]); | |
116: | |
117: r = reduceOp(r, smem[3]); | |
118: | |
119: r = reduceOp(r, smem[4]); | |
120: | |
121: r = reduceOp(r, smem[5]); | |
122: | |
123: r = reduceOp(r, smem[6]); | |
124: | |
125: r = reduceOp(r, smem[7]); | |
126: | |
127: r = reduceOp(r, smem[8]); | |
128: | |
129: r = reduceOp(r, smem[9]); | |
130: | |
131: r = reduceOp(r, smem[10]); | |
132: | |
133: r = reduceOp(r, smem[11]); | |
134: | |
135: r = reduceOp(r, smem[12]); | |
136: | |
137: r = reduceOp(r, smem[13]); | |
138: | |
139: r = reduceOp(r, smem[14]); | |
140: | |
141: r = reduceOp(r, smem[15]); | |
142: | |
143: r = reduceOp(r, smem[16]); | |
144: | |
145: r = reduceOp(r, smem[17]); | |
146: | |
147: r = reduceOp(r, smem[18]); | |
148: | |
149: r = reduceOp(r, smem[19]); | |
150: | |
151: r = reduceOp(r, smem[20]); | |
152: | |
153: r = reduceOp(r, smem[21]); | |
154: | |
155: r = reduceOp(r, smem[22]); | |
156: | |
157: r = reduceOp(r, smem[23]); | |
158: | |
159: r = reduceOp(r, smem[24]); | |
160: | |
161: r = reduceOp(r, smem[25]); | |
162: | |
163: r = reduceOp(r, smem[26]); | |
164: | |
165: r = reduceOp(r, smem[27]); | |
166: | |
167: r = reduceOp(r, smem[28]); | |
168: | |
169: r = reduceOp(r, smem[29]); | |
170: | |
171: r = reduceOp(r, smem[30]); | |
172: | |
173: r = reduceOp(r, smem[31]); | |
174: | |
175: } else { | |
176: for (int i = 1; i < numLanesParticipating; ++i) { | |
177: r = reduceOp(r, smem[i]); | |
178: } | |
179: } | |
180: } | |
181: | |
182: return r; | |
183: } | |
184: | |
185: | |
186: | |
187: | |
188: // Kernel that handles an entire reduction of a tensor in one pass | |
189: kernel void | |
190: THClTensor_reduceAll(global TensorInfoCl *in_info, | |
191: global float *in_data, | |
192: unsigned int totalElements, | |
193: float init, | |
194: global float* out, | |
195: local float *smem) { | |
196: // With a block-wide stride, have each thread perform its own reduction. | |
197: float r = init; | |
198: for (unsigned int i = get_local_id(0); i < totalElements; i += get_local_size(0)) { | |
199: const unsigned int inOffset = IndexToOffset_998_get(i, &in_info[0]); | |
200: r = reduceOp(r, modifyOp(in_data[inOffset])); | |
201: } | |
202: | |
203: // Reduce within the block | |
204: r = reduceBlock(smem, get_local_size(0), r, init); | |
205: | |
206: if(get_local_id(0) == 0) { | |
207: // Write out reduced value | |
208: out[0] = r; | |
209: } | |
210: } | |
211: | |
212: inline unsigned int getStartIndex(unsigned int totalSize) { | |
213: unsigned int sizePerBlock = THClCeilDiv(totalSize, (unsigned int) get_num_groups(0)); | |
214: return get_group_id(0) * sizePerBlock; | |
215: } | |
216: | |
217: inline unsigned int getEndIndex(unsigned int totalSize) { | |
218: unsigned int sizePerBlock = THClCeilDiv(totalSize, (unsigned int) get_num_groups(0)); | |
219: return min((unsigned int) ((get_group_id(0) + 1) * sizePerBlock), totalSize); | |
220: } | |
221: | |
222: // Kernel that handles an entire reduction of a tensor in two passes | |
223: kernel void | |
224: THClTensor_reduceAllPass1(global TensorInfoCl *in_info, | |
225: global float *in_data, | |
226: unsigned int totalElements, | |
227: float init, | |
228: global float* scratchSpace, | |
229: local float *smem) { | |
230: const unsigned int startIndex = getStartIndex(totalElements); | |
231: const unsigned int endIndex = getEndIndex(totalElements); | |
232: | |
233: // With a block-wide stride, have each thread perform its own reduction. | |
234: float r = init; | |
235: for (unsigned int i = startIndex + get_local_id(0); i < endIndex; i += get_local_size(0)) { | |
236: const unsigned int inOffset = IndexToOffset_998_get(i, &in_info[0]); | |
237: r = reduceOp(r, modifyOp(in_data[inOffset])); | |
238: } | |
239: | |
240: // Reduce within the block | |
241: r = reduceBlock(smem, get_local_size(0), r, init); | |
242: | |
243: if ((int)get_local_id(0) == 0) { | |
244: // Write out block-wide reduced value | |
245: scratchSpace[get_group_id(0)] = r; | |
246: } | |
247: } | |
248: | |
249: kernel void THClTensor_reduceAllPass2(int numPass1Blocks, | |
250: float init, | |
251: global float* scratchSpace, | |
252: global float* out, | |
253: local float *smem) { | |
254: float r = init; | |
255: if ((int)get_local_id(0) < numPass1Blocks) { | |
256: r = scratchSpace[get_local_id(0)]; | |
257: } | |
258: | |
259: // Reduce within the block | |
260: r = reduceBlock(smem, numPass1Blocks, r, init); | |
261: | |
262: if((int)get_local_id(0) == 0) { | |
263: out[0] = r; | |
264: } | |
265: } | |
266: | |
267: | |
268: | |
269: | |
Something went wrong with clCreateKernel, OpenCL error code -45 | |
THClReduceAll.cl build log: | |
unsupported call to function reduceBlock in THClTensor_reduceAll | |
____*________****______________________________________*****_____**| ==> mse_variablebatchsize | |
THClReduceAll.cl build log: | |
unsupported call to function reduceBlock in THClTensor_reduceAll | |
kernel build error: | |
kernel source: | |
1: inline unsigned int THClCeilDiv(unsigned int a, unsigned int b) { | |
2: return (a + b - 1) / b; | |
3: } | |
4: | |
5: | |
6: | |
7: inline float modifyOp(float _in1) { | |
8: float _out; | |
9: float *in1 = &_in1; | |
10: float *out = &_out; | |
11: *out = *in1; | |
12: return _out; | |
13: } | |
14: | |
15: inline float reduceOp(float _in1, float _in2) { | |
16: // I guess the compiler can sort this stuff out :-P | |
17: float _out; | |
18: float *in1 = &_in1; | |
19: float *in2 = &_in2; | |
20: float *out = &_out; | |
21: *out = *in1 + *in2; | |
22: return _out; | |
23: } | |
24: | |
25: // kernel argument that defines tensor layout | |
26: typedef struct TensorInfoCl { | |
27: // Extracts size/stride information for the kernel. | |
28: // Successive dimensions can be collapsed if the size/strides match | |
29: // up and thus there are no holes between the dimensions. This is used | |
30: // to reduce the complexity of the problem. | |
31: // The optional `reduceDim` indicates a reduction dimension for the | |
32: // given tensor, so that the output size for this dimension will be 1. | |
33: | |
34: unsigned int sizes[25]; | |
35: unsigned int strides[25]; | |
36: unsigned int offset; | |
37: int dims; | |
38: } TensorInfoCl; | |
39: // Contiguous tensors of more than one dimension are collapsed down | |
40: // to one tensor | |
41: | |
42: | |
43: // Translate a linear index for the apply to a float* offset; | |
44: // specialized on `Dims` to reduce nvcc compilation time | |
45: | |
46: | |
47: inline unsigned int IndexToOffset_998_get(unsigned int linearId, global const TensorInfoCl *info) { | |
48: return linearId + info->offset; | |
49: } | |
50: | |
51: inline unsigned int IndexToOffset_999_get(unsigned int linearId, global const TensorInfoCl *info) { | |
52: unsigned int offset = info->offset; | |
53: | |
54: // Use dynamic dims | |
55: for (int i = info->dims - 1; i >= 0; --i) { | |
56: unsigned int curDimIndex = linearId % info->sizes[i]; | |
57: unsigned int curDimOffset = curDimIndex * info->strides[i]; | |
58: offset += curDimOffset; | |
59: | |
60: linearId /= info->sizes[i]; | |
61: } | |
62: | |
63: return offset; | |
64: } | |
65: | |
66: inline unsigned int getLinearBlockId() { | |
67: return get_group_id(2) * get_num_groups(1) * get_num_groups(0) + | |
68: get_group_id(1) * get_num_groups(0) + | |
69: get_group_id(0); | |
70: } | |
71: | |
72: // Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will | |
73: // return the reduced value | |
74: | |
75: inline float reduceBlock( local float* smem, | |
76: int numVals, | |
77: float threadVal, | |
78: float init) { | |
79: if (numVals == 0) { | |
80: return init; | |
81: } | |
82: | |
83: if ((int)get_local_id(0) < numVals) { | |
84: smem[ get_local_id(0)] = threadVal; | |
85: } | |
86: | |
87: // First warp will perform reductions across warps | |
88: barrier(CLK_LOCAL_MEM_FENCE); | |
89: if ((get_local_id(0) / 32) == 0) { | |
90: float r = (int)get_local_id(0) < numVals ? smem[get_local_id(0)] : init; | |
91: | |
92: for (int i = 32 + get_local_id(0); i < numVals; i += 32) { | |
93: r = reduceOp(r, smem[i]); | |
94: } | |
95: | |
96: smem[get_local_id(0)] = r; | |
97: } | |
98: | |
99: // First thread will perform reductions across the block | |
100: barrier(CLK_LOCAL_MEM_FENCE); | |
101: | |
102: float r = init; | |
103: if (get_local_id(0) == 0) { | |
104: r = smem[0]; | |
105: | |
106: int numLanesParticipating = min(numVals, 32); | |
107: | |
108: if (numLanesParticipating == 32) { | |
109: // Unroll for 32 == 32 and numVals >= 32 | |
110: // #pragma unroll | |
111: // unrolling by hand, so compiler-independent | |
112: | |
113: r = reduceOp(r, smem[1]); | |
114: | |
115: r = reduceOp(r, smem[2]); | |
116: | |
117: r = reduceOp(r, smem[3]); | |
118: | |
119: r = reduceOp(r, smem[4]); | |
120: | |
121: r = reduceOp(r, smem[5]); | |
122: | |
123: r = reduceOp(r, smem[6]); | |
124: | |
125: r = reduceOp(r, smem[7]); | |
126: | |
127: r = reduceOp(r, smem[8]); | |
128: | |
129: r = reduceOp(r, smem[9]); | |
130: | |
131: r = reduceOp(r, smem[10]); | |
132: | |
133: r = reduceOp(r, smem[11]); | |
134: | |
135: r = reduceOp(r, smem[12]); | |
136: | |
137: r = reduceOp(r, smem[13]); | |
138: | |
139: r = reduceOp(r, smem[14]); | |
140: | |
141: r = reduceOp(r, smem[15]); | |
142: | |
143: r = reduceOp(r, smem[16]); | |
144: | |
145: r = reduceOp(r, smem[17]); | |
146: | |
147: r = reduceOp(r, smem[18]); | |
148: | |
149: r = reduceOp(r, smem[19]); | |
150: | |
151: r = reduceOp(r, smem[20]); | |
152: | |
153: r = reduceOp(r, smem[21]); | |
154: | |
155: r = reduceOp(r, smem[22]); | |
156: | |
157: r = reduceOp(r, smem[23]); | |
158: | |
159: r = reduceOp(r, smem[24]); | |
160: | |
161: r = reduceOp(r, smem[25]); | |
162: | |
163: r = reduceOp(r, smem[26]); | |
164: | |
165: r = reduceOp(r, smem[27]); | |
166: | |
167: r = reduceOp(r, smem[28]); | |
168: | |
169: r = reduceOp(r, smem[29]); | |
170: | |
171: r = reduceOp(r, smem[30]); | |
172: | |
173: r = reduceOp(r, smem[31]); | |
174: | |
175: } else { | |
176: for (int i = 1; i < numLanesParticipating; ++i) { | |
177: r = reduceOp(r, smem[i]); | |
178: } | |
179: } | |
180: } | |
181: | |
182: return r; | |
183: } | |
184: | |
185: | |
186: | |
187: | |
188: // Kernel that handles an entire reduction of a tensor in one pass | |
189: kernel void | |
190: THClTensor_reduceAll(global TensorInfoCl *in_info, | |
191: global float *in_data, | |
192: unsigned int totalElements, | |
193: float init, | |
194: global float* out, | |
195: local float *smem) { | |
196: // With a block-wide stride, have each thread perform its own reduction. | |
197: float r = init; | |
198: for (unsigned int i = get_local_id(0); i < totalElements; i += get_local_size(0)) { | |
199: const unsigned int inOffset = IndexToOffset_998_get(i, &in_info[0]); | |
200: r = reduceOp(r, modifyOp(in_data[inOffset])); | |
201: } | |
202: | |
203: // Reduce within the block | |
204: r = reduceBlock(smem, get_local_size(0), r, init); | |
205: | |
206: if(get_local_id(0) == 0) { | |
207: // Write out reduced value | |
208: out[0] = r; | |
209: } | |
210: } | |
211: | |
212: inline unsigned int getStartIndex(unsigned int totalSize) { | |
213: unsigned int sizePerBlock = THClCeilDiv(totalSize, (unsigned int) get_num_groups(0)); | |
214: return get_group_id(0) * sizePerBlock; | |
215: } | |
216: | |
217: inline unsigned int getEndIndex(unsigned int totalSize) { | |
218: unsigned int sizePerBlock = THClCeilDiv(totalSize, (unsigned int) get_num_groups(0)); | |
219: return min((unsigned int) ((get_group_id(0) + 1) * sizePerBlock), totalSize); | |
220: } | |
221: | |
222: // Kernel that handles an entire reduction of a tensor in two passes | |
223: kernel void | |
224: THClTensor_reduceAllPass1(global TensorInfoCl *in_info, | |
225: global float *in_data, | |
226: unsigned int totalElements, | |
227: float init, | |
228: global float* scratchSpace, | |
229: local float *smem) { | |
230: const unsigned int startIndex = getStartIndex(totalElements); | |
231: const unsigned int endIndex = getEndIndex(totalElements); | |
232: | |
233: // With a block-wide stride, have each thread perform its own reduction. | |
234: float r = init; | |
235: for (unsigned int i = startIndex + get_local_id(0); i < endIndex; i += get_local_size(0)) { | |
236: const unsigned int inOffset = IndexToOffset_998_get(i, &in_info[0]); | |
237: r = reduceOp(r, modifyOp(in_data[inOffset])); | |
238: } | |
239: | |
240: // Reduce within the block | |
241: r = reduceBlock(smem, get_local_size(0), r, init); | |
242: | |
243: if ((int)get_local_id(0) == 0) { | |
244: // Write out block-wide reduced value | |
245: scratchSpace[get_group_id(0)] = r; | |
246: } | |
247: } | |
248: | |
249: kernel void THClTensor_reduceAllPass2(int numPass1Blocks, | |
250: float init, | |
251: global float* scratchSpace, | |
252: global float* out, | |
253: local float *smem) { | |
254: float r = init; | |
255: if ((int)get_local_id(0) < numPass1Blocks) { | |
256: r = scratchSpace[get_local_id(0)]; | |
257: } | |
258: | |
259: // Reduce within the block | |
260: r = reduceBlock(smem, numPass1Blocks, r, init); | |
261: | |
262: if((int)get_local_id(0) == 0) { | |
263: out[0] = r; | |
264: } | |
265: } | |
266: | |
267: | |
268: | |
269: | |
Something went wrong with clCreateKernel, OpenCL error code -45 | |
THClReduceAll.cl build log: | |
unsupported call to function reduceBlock in THClTensor_reduceAll | |
____*________****______________________________________*****_____*** ==> Done | |
Completed 82 asserts in 68 tests with 13 errors | |
-------------------------------------------------------------------------------- | |
ClassNLLCriterionMultipleTarget | |
Function call failed | |
C++ exception | |
-------------------------------------------------------------------------------- | |
LogSoftMax_backward | |
Function call failed | |
C++ exception | |
-------------------------------------------------------------------------------- | |
LogSoftMax_backward_batch | |
Function call failed | |
C++ exception | |
-------------------------------------------------------------------------------- | |
LogSoftMax_forward | |
Function call failed | |
C++ exception | |
-------------------------------------------------------------------------------- | |
LogSoftMax_forward_batch | |
Function call failed | |
C++ exception | |
-------------------------------------------------------------------------------- | |
Sum_backward | |
Function call failed | |
/home/lcestari/torch/install/share/lua/5.1/nn/Sum.lua:27: Something went wrong: | |
kernel source: | |
1: // Threads per thread block | |
2: #define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16 | |
3: | |
4: inline float modifyOp(float _in1) { | |
5: float _out; | |
6: float *in1 = &_in1; | |
7: float *out = &_out; | |
8: *out = *in1; | |
9: return _out; | |
10: } | |
11: | |
12: inline float reduceOp(float _in1, float _in2) { | |
13: // I guess the compiler can sort this stuff out :-P | |
14: float _out; | |
15: float *in1 = &_in1; | |
16: float *in2 = &_in2; | |
17: float *out = &_out; | |
18: *out = *in1 + *in2; | |
19: return _out; | |
20: } | |
21: | |
22: // kernel argument that defines tensor layout | |
23: typedef struct TensorInfoCl { | |
24: // Extracts size/stride information for the kernel. | |
25: // Successive dimensions can be collapsed if the size/strides match | |
26: // up and thus there are no holes between the dimensions. This is used | |
27: // to reduce the complexity of the problem. | |
28: // The optional `reduceDim` indicates a reduction dimension for the | |
29: // given tensor, so that the output size for this dimension will be 1. | |
30: | |
31: int sizes[25]; | |
32: int strides[25]; | |
33: int offset; | |
34: int dims; | |
35: } TensorInfoCl; | |
36: // Contiguous tensors of more than one dimension are collapsed down | |
37: // to one tensor | |
38: | |
39: | |
40: // Translate a linear index for the apply to a float* offset; | |
41: // specialized on `Dims` to reduce nvcc compilation time | |
42: | |
43: inline int IndexToOffset_1001_get( int linearId, global TensorInfoCl *info) { | |
44: int offset = info->offset; | |
45: | |
46: // Use static dims | |
47: // for (int i = 1 - 1; i >= 0; --i) { | |
48: int curDimIndex; | |
49: int curDimOffset; | |
50: // bake this in.... | |
51: curDimIndex = linearId 64nfo->sizes[0]; | |
52: curDimOffset = curDimIndex * info->strides[0]; | |
53: offset += curDimOffset; | |
54: | |
55: | |
56: | |
57: // } | |
58: | |
59: return offset; | |
60: } | |
61: | |
62: | |
63: inline int IndexToOffset_998_get(int linearId, global const TensorInfoCl *info) { | |
64: return linearId + info->offset; | |
65: } | |
66: | |
67: inline int IndexToOffset_999_get(int linearId, global const Te | |
stack traceback: | |
[C]: in function 'sum' | |
/home/lcestari/torch/install/share/lua/5.1/nn/Sum.lua:27: in function 'forward' | |
/home/lcestari/torch/install/share/lua/5.1/clnn/test.lua:731: in function 'v' | |
/home/lcestari/torch/install/share/lua/5.1/clnn/test.lua:2557: in function </home/lcestari/torch/install/share/lua/5.1/clnn/test.lua:2555> | |
[C]: in function 'xpcall' | |
/home/lcestari/torch/install/share/lua/5.1/torch/Tester.lua:115: in function 'pcall' | |
/home/lcestari/torch/install/share/lua/5.1/torch/Tester.lua:186: in function '_run' | |
/home/lcestari/torch/install/share/lua/5.1/torch/Tester.lua:161: in function 'run' | |
/home/lcestari/torch/install/share/lua/5.1/clnn/test.lua:2596: in function 'test' | |
(command line):1: in main chunk | |
[C]: at 0x00405c20 | |
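The Sum_backward failure (and the identical Sum_forward dump below) is different in kind from the build errors above: here the generated source itself is mangled before it ever reaches the compiler. Line 51 of the dump reads `curDimIndex = linearId 64nfo->sizes[0];`, where the modulo operator and the start of `info` appear to have been eaten, presumably by the code-generation templating or by the log capture, and line 67 is cut off mid-identifier. Going by the intact IndexToOffset_999_get variant in the earlier dumps, the helper was presumably meant to perform the usual modulo/stride step; a guess at the intended code, not the actual generated source:

// Likely intent of the corrupted IndexToOffset_1001_get (reconstruction, not verbatim).
inline int IndexToOffset_1001_get(int linearId, global TensorInfoCl *info) {
  int offset = info->offset;
  int curDimIndex  = linearId % info->sizes[0];    // dumped as "linearId 64nfo->sizes[0]"
  int curDimOffset = curDimIndex * info->strides[0];
  offset += curDimOffset;
  return offset;
}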
-------------------------------------------------------------------------------- | |
Sum_forward | |
Function call failed | |
/home/lcestari/torch/install/share/lua/5.1/nn/Sum.lua:27: Something went wrong: | |
kernel source: | |
1: // Threads per thread block | |
2: #define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16 | |
3: | |
4: inline float modifyOp(float _in1) { | |
5: float _out; | |
6: float *in1 = &_in1; | |
7: float *out = &_out; | |
8: *out = *in1; | |
9: return _out; | |
10: } | |
11: | |
12: inline float reduceOp(float _in1, float _in2) { | |
13: // I guess the compiler can sort this stuff out :-P | |
14: float _out; | |
15: float *in1 = &_in1; | |
16: float *in2 = &_in2; | |
17: float *out = &_out; | |
18: *out = *in1 + *in2; | |
19: return _out; | |
20: } | |
21: | |
22: // kernel argument that defines tensor layout | |
23: typedef struct TensorInfoCl { | |
24: // Extracts size/stride information for the kernel. | |
25: // Successive dimensions can be collapsed if the size/strides match | |
26: // up and thus there are no holes between the dimensions. This is used | |
27: // to reduce the complexity of the problem. | |
28: // The optional `reduceDim` indicates a reduction dimension for the | |
29: // given tensor, so that the output size for this dimension will be 1. | |
30: | |
31: int sizes[25]; | |
32: int strides[25]; | |
33: int offset; | |
34: int dims; | |
35: } TensorInfoCl; | |
36: // Contiguous tensors of more than one dimension are collapsed down | |
37: // to one tensor | |
38: | |
39: | |
40: // Translate a linear index for the apply to a float* offset; | |
41: // specialized on `Dims` to reduce nvcc compilation time | |
42: | |
43: inline int IndexToOffset_1001_get( int linearId, global TensorInfoCl *info) { | |
44: int offset = info->offset; | |
45: | |
46: // Use static dims | |
47: // for (int i = 1 - 1; i >= 0; --i) { | |
48: int curDimIndex; | |
49: int curDimOffset; | |
50: // bake this in.... | |
51: curDimIndex = linearId 64nfo->sizes[0]; | |
52: curDimOffset = curDimIndex * info->strides[0]; | |
53: offset += curDimOffset; | |
54: | |
55: | |
56: | |
57: // } | |
58: | |
59: return offset; | |
60: } | |
61: | |
62: | |
63: inline int IndexToOffset_998_get(int linearId, global const TensorInfoCl *info) { | |
64: return linearId + info->offset; | |
65: } | |
66: | |
67: inline int IndexToOffset_999_get(int linearId, global const Te | |
stack traceback: | |
[C]: in function 'sum' | |
/home/lcestari/torch/install/share/lua/5.1/nn/Sum.lua:27: in function 'forward' | |
/home/lcestari/torch/install/share/lua/5.1/clnn/test.lua:697: in function 'v' | |
/home/lcestari/torch/install/share/lua/5.1/clnn/test.lua:2557: in function </home/lcestari/torch/install/share/lua/5.1/clnn/test.lua:2555> | |
[C]: in function 'xpcall' | |
/home/lcestari/torch/install/share/lua/5.1/torch/Tester.lua:115: in function 'pcall' | |
/home/lcestari/torch/install/share/lua/5.1/torch/Tester.lua:186: in function '_run' | |
/home/lcestari/torch/install/share/lua/5.1/torch/Tester.lua:161: in function 'run' | |
/home/lcestari/torch/install/share/lua/5.1/clnn/test.lua:2596: in function 'test' | |
(command line):1: in main chunk | |
[C]: at 0x00405c20 | |
-------------------------------------------------------------------------------- | |
Tanh_backward | |
Function call failed | |
C++ exception | |
-------------------------------------------------------------------------------- | |
Tanh_forward | |
Function call failed | |
C++ exception | |
-------------------------------------------------------------------------------- | |
Tanh_transposed | |
Function call failed | |
C++ exception | |
-------------------------------------------------------------------------------- | |
mse | |
Function call failed | |
C++ exception | |
-------------------------------------------------------------------------------- | |
mse_nosizeaverage | |
Function call failed | |
C++ exception | |
-------------------------------------------------------------------------------- | |
mse_variablebatchsize | |
Function call failed | |
C++ exception | |
-------------------------------------------------------------------------------- |