Created
July 27, 2016 03:44
-
-
Save marty1885/b0a21304e605502faa6c7bd224788a91 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
major_version 2 | |
major vresion 2 | |
xor | |
Using Intel , OpenCL platform: Intel Gen OCL Driver | |
Using OpenCL device: Intel(R) HD Graphics Skylake Desktop GT2 | |
initializing clblas | |
layer 0:InputLayer{ outputPlanes=2 outputSize=1 } | |
layer 1:ConvolutionalLayer{ LayerDimensions{ inputPlanes=2 inputSize=1 numFilters=2 filterSize=1 outputSize=1 padZeros=1 biased=1 skip=0} } | |
layer 2:ActivationLayer{ SIGMOID } | |
layer 3:ConvolutionalLayer{ LayerDimensions{ inputPlanes=2 inputSize=1 numFilters=2 filterSize=1 outputSize=1 padZeros=1 biased=1 skip=0} } | |
layer 4:ActivationLayer{ SIGMOID } | |
layer 5:SoftMaxLayer{ perPlane=0 numPlanes=2 imageSize=1 } | |
statefultimer v0.7 | |
forward try kernel 0 | |
... not plausibly optimal, skipping | |
forward try kernel 1 | |
... seems valid | |
ForwardAuto: kernel 1 0ms | |
forward try kernel 0 | |
... not plausibly optimal, skipping | |
forward try kernel 1 | |
... seems valid | |
ForwardAuto: kernel 1 0ms | |
backward try kernel 0 | |
... not plausibly optimal, skipping | |
backward try kernel 1 | |
... seems valid | |
BackwardAuto: kernel 1 0ms | |
calcGradWeights try kernel 0 | |
... not plausibly optimal, skipping | |
calcGradWeights try kernel 1 | |
... seems valid | |
BackpropWeightsAuto: kernel 1 0ms | |
calcGradWeights try kernel 0 | |
... not plausibly optimal, skipping | |
calcGradWeights try kernel 1 | |
... seems valid | |
BackpropWeightsAuto: kernel 1 0ms | |
after epoch 1 284 ms | |
training loss: 2.77513 | |
train accuracy: 2/4 50% | |
forward try kernel 2 | |
... seems valid | |
ForwardAuto: kernel 2 7473ms | |
forward try kernel 2 | |
... seems valid | |
drm_intel_gem_bo_context_exec() failed: Input/output error | |
ForwardAuto: kernel 2 this instance cant be used: | |
kernel source: | |
1: // Copyright Hugh Perkins 2015 hughperkins at gmail | |
2: // | |
3: // This Source Code Form is subject to the terms of the Mozilla Public License, | |
4: // v. 2.0. If a copy of the MPL was not distributed with this file, You can | |
5: // obtain one at http://mozilla.org/MPL/2.0/. | |
6: | |
7: kernel void per_element_add(const int N, global float *target, global const float *source) { | |
8: const int globalId = get_global_id(0); | |
9: if (globalId >= N) { | |
10: return; | |
11: } | |
12: target[globalId] += source[globalId]; | |
13: } | |
14: | |
15: // adds source to target | |
16: // tiles source as necessary, according to tilingSize | |
17: kernel void per_element_tiled_add(const int N, const int tilingSize, global float *target, global const float *source) { | |
18: const int globalId = get_global_id(0); | |
19: if (globalId >= N) { | |
20: return; | |
21: } | |
22: target[globalId] += source[globalId % tilingSize]; | |
23: } | |
24: | |
25: kernel void repeated_add(const int N, const int sourceSize, const int repeatSize, global float *target, global const float *source) { | |
26: const int globalId = get_global_id(0); | |
27: if (globalId >= N) { | |
28: return; | |
29: } | |
30: target[globalId] += source[ (globalId / repeatSize) % sourceSize ]; | |
31: } | |
32: | |
33: | |
Out of resources, code -5 | |
forward try kernel 3 | |
... seems valid | |
drm_intel_gem_bo_context_exec() failed: Input/output error | |
ForwardAuto: kernel 3 this instance cant be used: | |
kernel source: | |
1: // Copyright Hugh Perkins 2014, 2015 hughperkins at gmail | |
2: // | |
3: // This Source Code Form is subject to the terms of the Mozilla Public License, | |
4: // v. 2.0. If a copy of the MPL was not distributed with this file, You can | |
5: // obtain one at http://mozilla.org/MPL/2.0/. | |
6: | |
7: // concept: each workgroup handles convolving one input example with one filtercube | |
8: // and writing out one single output plane | |
9: // | |
10: // workgroup id organized like: [imageid][outplane] | |
11: // local id organized like: [outrow][outcol] | |
12: // each thread iterates over: [upstreamplane][filterrow][filtercol] | |
13: // number workgroups = 32 | |
14: // one filter plane takes up 5 * 5 * 4 = 100 bytes | |
15: // one filter cube (corresponding to one outplane) = 5*5 * 32 * 4 = 3.2KB (ok) | |
16: // all filter cubes = 3.2KB * 32 = 102KB (too big) | |
17: // output are organized like [imageid][filterid][row][col] | |
18: void kernel forward_3_by_n_outplane(const int batchSize, | |
19: global const float *images, global const float *filters, | |
20: global float *output, | |
21: local float *_upstreamImage, local float *_filterCube) { | |
22: const int globalId = get_global_id(0); | |
23: | |
24: const int workgroupId = get_group_id(0); | |
25: const int workgroupSize = get_local_size(0); | |
26: const int n = workgroupId / gNumFilters; | |
27: const int outPlane = workgroupId % gNumFilters; | |
28: | |
29: const int localId = get_local_id(0); | |
30: const int outputRow = localId / gOutputSize; | |
31: const int outputCol = localId % gOutputSize; | |
32: | |
33: const int minu = gPadZeros ? max(-gHalfFilterSize, -outputRow) : -gHalfFilterSize; | |
34: const int maxu = gPadZeros ? min(gHalfFilterSize - gEven, gOutputSize - 1 - outputRow - gEven) : gHalfFilterSize - gEven; | |
35: const int minv = gPadZeros ? max(-gHalfFilterSize, -outputCol) : - gHalfFilterSize; | |
36: const int maxv = gPadZeros ? min(gHalfFilterSize - gEven, gOutputSize - 1 - outputCol - gEven) : gHalfFilterSize - gEven; | |
37: | |
38: const int numUpstreamsPerThread = (gInputSizeSquared + workgroupSize - 1) / workgroupSize; | |
39: | |
40: const int filterCubeLength = gInputPlanes * gFilterSizeSquared; | |
41: const int filterCubeGlobalOffset = outPlane * filterCubeLength; | |
42: const int numPixelsPerThread = (filterCubeLength + workgroupSize - 1) / workgroupSize; | |
43: for (int i = 0; i < numPixelsPerThread; i++) { | |
44: int thisOffset = localId + i * workgroupSize; | |
45: if (thisOffset < filterCubeLength) { | |
46: _filterCube[thisOffset] = filters[filterCubeGlobalOffset + thisOffset]; | |
47: } | |
48: } | |
49: // dont need a barrier, since we'll just run behind the barrier from the upstream image download | |
50: | |
51: float sum = 0; | |
52: for (int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++) { | |
53: int thisUpstreamImageOffset = (n * gInputPlanes + upstreamPlane) * gInputSizeSquared; | |
54: barrier(CLK_LOCAL_MEM_FENCE); | |
55: for (int i = 0; i < numUpstreamsPerThread; i++) { | |
56: int thisOffset = workgroupSize * i + localId; | |
57: if (thisOffset < gInputSizeSquared) { | |
58: _upstreamImage[ thisOffset ] = images[ thisUpstreamImageOffset + thisOffset ]; | |
59: } | |
60: } | |
61: barrier(CLK_LOCAL_MEM_FENCE); | |
62: int filterImageOffset = upstreamPlane * gFilterSizeSquared; | |
63: for (int u = minu; u <= maxu; u++) { | |
64: int inputRow = outputRow + u; | |
65: #if gPadZeros == 0 | |
66: inputRow += gHalfFilterSize; | |
67: #endif | |
68: int inputimagerowoffset = inputRow * gInputSize; | |
69: int filterrowoffset = filterImageOffset + (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize; | |
70: for (int v = minv; v <= maxv; v++) { | |
71: int inputCol = outputCol + v; | |
72: #if gPadZeros == 0 | |
73: inputCol += gHalfFilterSize; | |
74: #endif | |
75: if (localId < gOutputSizeSquared) { | |
76: sum += _upstreamImage[ inputimagerowoffset + inputCol] * _filterCube[ filterrowoffset + v ]; | |
77: } | |
78: } | |
79: } | |
80: } | |
81: | |
82: // output are organized like [imageid][filterid][row][col] | |
83: int resultIndex = (n * gNumFilters + outPlane) * gOutputSizeSquared + localId; | |
84: if (localId < gOutputSizeSquared) { | |
85: output[resultIndex ] = sum; | |
86: } | |
87: } | |
88: | |
89: | |
Out of resources, code -5 | |
forward try kernel 4 | |
... seems valid | |
drm_intel_gem_bo_context_exec() failed: Input/output error | |
ForwardAuto: kernel 4 this instance cant be used: | |
kernel source: | |
1: // Copyright Hugh Perkins 2014, 2015 hughperkins at gmail | |
2: // | |
3: // This Source Code Form is subject to the terms of the Mozilla Public License, | |
4: // v. 2.0. If a copy of the MPL was not distributed with this file, You can | |
5: // obtain one at http://mozilla.org/MPL/2.0/. | |
6: | |
7: void copyLocal(local float *target, global float const *source, int N) { | |
8: int numLoops = (N + get_local_size(0) - 1) / get_local_size(0); | |
9: for (int loop = 0; loop < numLoops; loop++) { | |
10: int offset = loop * get_local_size(0) + get_local_id(0); | |
11: if (offset < N) { | |
12: target[offset] = source[offset]; | |
13: } | |
14: } | |
15: } | |
16: | |
17: #ifdef gOutputSize // for previous tests that dont define it | |
18: // workgroup id organized like: [n][filterid] | |
19: // local id organized like: [outrow][outcol] | |
20: // each thread iterates over: [upstreamplane][filterrow][filtercol] | |
21: // number workgroups = 32 | |
22: // one filter plane takes up 5 * 5 * 4 = 100 bytes | |
23: // one filter cube (corresponding to one outplane) = 5*5 * 32 * 4 = 3.2KB (ok) | |
24: // all filter cubes = 3.2KB * 32 = 102KB (too big) | |
25: // output are organized like [n][filterid][outrow][outcol] | |
26: // the pixels per thread thing... : | |
27: // - we have one thread (~= cuda core) per output value, | |
28: // ie one thread for each combination of [outrow][outcol] | |
29: // - however, the number of threads is typically limited on a gpu, | |
30: // eg to 512 (eg Intel HD), or 1024 (eg nVidia K520) | |
31: // - so what happens if the number of output points is larger than | |
32: // the maximum workgroup size? | |
33: // - then we have several possibilities really: | |
34: // - we can divide the image into blocks, and process each block | |
35: // separately. This is probably a good option, but fair amount of | |
36: // work | |
37: // - we can get each thread to handle more than one output | |
38: // pixel, by looping | |
39: // - we can consider the output image in 1d, by putting the rows | |
40: // one after another, and assign each contiguous workgroup-size | |
41: // block to one workgroup | |
42: // => this is how this kernel works | |
43: // basically, it's a hack, so larger images actually run, without | |
44: // crashing, and we can probably improve it a lot :-) | |
45: // | |
46: // So, when outputSize * outputSize > workgroupSize, then | |
47: // multiple workgroups will be created for each output plane | |
48: // the number of such workgroups is given by: `gPixelsPerThread` | |
49: // the id of our workgroup within such a set of workgroups is calculated | |
50: // as `pixel` | |
51: // effectiveLocalId is our local id if we had one enormous workgroup | |
52: // containing the whole output image plane | |
53: void kernel forward_4_by_n_outplane_smallercache(const int batchSize, | |
54: global const float *images, global const float *filters, | |
55: global float *output, | |
56: local float *_inputPlane, local float *_filterPlane) { | |
57: #define globalId (get_global_id(0)) | |
58: | |
59: #define localId (get_local_id(0)) | |
60: #define workgroupId (get_group_id(0)) | |
61: // const int workgroupSize = get_local_size(0); | |
62: const int effectiveWorkgroupId = workgroupId / gPixelsPerThread; | |
63: const int pixel = workgroupId % gPixelsPerThread; | |
64: const int effectiveLocalId = localId + pixel * gWorkgroupSize; | |
65: const int n = effectiveWorkgroupId / gNumFilters; | |
66: const int outPlane = effectiveWorkgroupId % gNumFilters; | |
67: | |
68: const int outputRow = effectiveLocalId / gOutputSize; | |
69: const int outputCol = effectiveLocalId % gOutputSize; | |
70: | |
71: float sum = 0; | |
72: for (int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++) { | |
73: barrier(CLK_LOCAL_MEM_FENCE); | |
74: copyLocal(_inputPlane, images + (n * gInputPlanes + upstreamPlane) * gInputSizeSquared, gInputSizeSquared); | |
75: copyLocal(_filterPlane, filters + (outPlane * gInputPlanes + upstreamPlane) * gFilterSizeSquared, gFilterSizeSquared); | |
76: barrier(CLK_LOCAL_MEM_FENCE); | |
77: | |
78: if (effectiveLocalId < gOutputSizeSquared) { | |
79: for (int u = -gHalfFilterSize; u <= gHalfFilterSize - gEven; u++) { | |
80: // trying to reduce register pressure... | |
81: #if gPadZeros == 1 | |
82: #define inputRow (outputRow + u) | |
83: #else | |
84: #define inputRow (outputRow + u + gHalfFilterSize) | |
85: #endif | |
86: int inputimagerowoffset = inputRow * gInputSize; | |
87: int filterrowoffset = (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize; | |
88: bool rowOk = inputRow >= 0 && inputRow < gInputSize; | |
89: for (int v = -gHalfFilterSize; v <= gHalfFilterSize - gEven; v++) { | |
90: #if gPadZeros == 1 | |
91: #define inputCol (outputCol + v) | |
92: #else | |
93: #define inputCol (outputCol + v + gHalfFilterSize) | |
94: #endif | |
95: bool process = rowOk && inputCol >= 0 && inputCol < gInputSize; | |
96: if (process) { | |
97: sum += _inputPlane[ inputimagerowoffset + inputCol] * _filterPlane[ filterrowoffset + v ]; | |
98: } | |
99: } | |
100: } | |
101: } | |
102: } | |
103: // output are organized like [imageid][filterid][row][col] | |
104: #define resultIndex (( n * gNumFilters + outPlane) * gOutputSizeSquared + effectiveLocalId) | |
105: if (effectiveLocalId < gOutputSizeSquared) { | |
106: output[resultIndex ] = sum; | |
107: } | |
108: } | |
109: #endif | |
110: | |
111: | |
Out of resources, code -5 | |
forward try kernel 5 | |
ForwardAuto: kernel 5: this instance cant be used: For ForwardFc, padzeros must be disabled | |
... not valid | |
forward try kernel 6 | |
... seems valid | |
drm_intel_gem_bo_context_exec() failed: Input/output error | |
ForwardAuto: kernel 6 this instance cant be used: | |
kernel source: | |
1: // Copyright Hugh Perkins 2014, 2015 hughperkins at gmail | |
2: // | |
3: // This Source Code Form is subject to the terms of the Mozilla Public License, | |
4: // v. 2.0. If a copy of the MPL was not distributed with this file, You can | |
5: // obtain one at http://mozilla.org/MPL/2.0/. | |
6: | |
7: // concept: | |
8: // - load same input plane from each image | |
9: // - hold filter plane for this input plane, for all filters | |
10: // - reduce afterwards | |
11: // local memory for one plane from each filter of 64c7 = 64 * 7 * 7 * 4 = 12.5KB | |
12: // local memory for one single input plane = 19 * 19 * 4 = 1.4KB | |
13: // => seems ok? | |
14: // workgroupid: [inputPlaneId] | |
15: // localid: [filterId][outRow] (if this is more than workgroupsize, we should reuse some threads...) | |
16: // iterate over: [n][outCol] | |
17: // output: [n][filterId][outRow][outCol][inputPlane] | |
18: // need to later reduce output over: [inputPlane] | |
19: void kernel forward_byinputplane(const int batchSize, | |
20: global const float *images, global const float *filters, | |
21: global float *output, | |
22: local float *_inputPlane, local float *_filterPlanes) { | |
23: // const int evenPadding = gFilterSize % 2 == 0 ? 1 : 0; | |
24: | |
25: const int globalId = get_global_id(0); | |
26: const int workgroupId = get_group_id(0); | |
27: const int workgroupSize = get_local_size(0); | |
28: const int localId = get_local_id(0); | |
29: | |
30: const int inputPlaneId = workgroupId; | |
31: const int numLoops = (gNumFilters * gOutputSize + workgroupSize - 1) / workgroupSize; | |
32: const int numFilterCopyLoops = (gFilterSizeSquared + gOutputSize - 1) / gOutputSize; | |
33: const int numImageCopyLoops = (gInputSizeSquared + workgroupSize - 1) / workgroupSize; | |
34: for (int loop = 0; loop < numLoops; loop++) { | |
35: const int loopLocalId = localId + loop * workgroupSize; | |
36: const int filterId = loopLocalId / gOutputSize; | |
37: const int outRow = loopLocalId % gOutputSize; | |
38: | |
39: // copy down our filter, we have gOutputSize threads to do this | |
40: global float const *globalFilterPlane = filters + | |
41: (filterId * gNumInputPlanes + inputPlaneId) * gFilterSizeSquared; | |
42: local float *_localFilterPlane = _filterPlanes + filterId * gFilterSizeSquared; | |
43: barrier(CLK_LOCAL_MEM_FENCE); | |
44: for (int i = 0; i < numFilterCopyLoops; i++) { | |
45: const int offset = i * gOutputSize + outRow; | |
46: bool process = filterId < gNumFilters && offset < gFilterSizeSquared; | |
47: if (process) { | |
48: _localFilterPlane[ offset ] = globalFilterPlane[ offset ]; | |
49: } | |
50: } | |
51: // loop over n ... | |
52: for (int n = 0; n < batchSize; n++) { | |
53: // copy down our imageplane, we have workgroupSize threads to do this | |
54: barrier(CLK_LOCAL_MEM_FENCE); | |
55: global float const *globalImagePlane = images + | |
56: (n * gNumInputPlanes + inputPlaneId) * gInputSizeSquared; | |
57: for (int i = 0; i< numImageCopyLoops; i++) { | |
58: const int offset = i * workgroupSize + localId; | |
59: if (offset < gInputSizeSquared) { | |
60: _inputPlane[ offset ] = globalImagePlane[ offset ]; | |
61: } | |
62: } | |
63: barrier(CLK_LOCAL_MEM_FENCE); | |
64: // calc output for each [outrow][outcol] | |
65: bool filterPlaneOk = filterId < gNumFilters; | |
66: for (int outCol = 0; outCol < gOutputSize; outCol++) { | |
67: float sum = 0; | |
68: for (int filterRow = 0; filterRow < gFilterSize; filterRow++) { | |
69: int inRow = outRow + filterRow; | |
70: #if gPadZeros == 1 | |
71: inRow -= gHalfFilterSize; | |
72: #endif | |
73: bool rowOk = filterPlaneOk && inRow >= 0 && inRow < gInputSize; | |
74: for (int filterCol = 0; filterCol < gFilterSize; filterCol++) { | |
75: int inCol = outCol + filterCol; | |
76: #if gPadZeros == 1 | |
77: inCol -= gHalfFilterSize; | |
78: #endif | |
79: bool process = rowOk && inCol >= 0 && inCol < gInputSize; | |
80: if (process) { | |
81: float imageValue = _inputPlane[ inRow * gInputSize + inCol ]; | |
82: float filterValue = _localFilterPlane[ filterRow * gFilterSize + filterCol ]; | |
83: sum += imageValue * filterValue; | |
84: } | |
85: } | |
86: } | |
87: if (filterId < gNumFilters) { | |
88: // [n][filterId][outRow][outCol][inputPlane] | |
89: int resultIndex = (( (n | |
90: * gNumFilters + filterId) | |
91: * gOutputSize + outRow) | |
92: * gOutputSize + outCol) | |
93: * gNumInputPlanes + inputPlaneId; | |
94: output[resultIndex] = sum; | |
95: //if (globalId == 2) output[0] = resultIndex; | |
96: // output[resultIndex] = outRow; | |
97: } | |
98: // output[localId] = _localFilterPlane[localId]; | |
99: } | |
100: } | |
101: } | |
102: } | |
103: | |
104: | |
Out of resources, code -5 | |
forward try kernel 7 | |
... seems valid | |
drm_intel_gem_bo_context_exec() failed: Input/output error | |
ForwardAuto: kernel 7 this instance cant be used: | |
kernel source: | |
1: // from SpatialConvolutionMM.cu: | |
2: | |
3: // CL: grid stride looping | |
4: #define CL_KERNEL_LOOP(i, n) \ | |
5: for (int i = get_group_id(0) * get_local_size(0) + get_local_id(0); \ | |
6: i < (n); \ | |
7: i += get_local_size(0) * get_num_groups(0)) | |
8: | |
9: //#define gPadding 0 | |
10: //#define gStride 1 | |
11: //#define gColSize 1 | |
12: //#define gFilterSize 1 | |
13: //#define gSize 1 | |
14: | |
15: // Kernel for fast unfold+copy | |
16: // (adapted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu) | |
17: kernel void im2col( | |
18: const int n, | |
19: global float const * im_data, int im_offset, | |
20: global float* data_col) { | |
21: global const float *data_im = im_data + im_offset; | |
22: | |
23: CL_KERNEL_LOOP(index, n) { | |
24: int w_out = index % 1; | |
25: index /= 1; | |
26: int h_out = index % 1; | |
27: int channel_in = index / 1; | |
28: int channel_out = channel_in * 1 * 1; | |
29: int h_in = h_out * 1 - 0; | |
30: int w_in = w_out * 1 - 0; | |
31: data_col += (channel_out * 1 + h_out) * 1 + w_out; | |
32: data_im += (channel_in * 1 + h_in) * 1 + w_in; | |
33: for (int i = 0; i < 1; ++i) { | |
34: for (int j = 0; j < 1; ++j) { | |
35: int h = h_in + i; | |
36: int w = w_in + j; | |
37: *data_col = (h >= 0 && w >= 0 && h < 1 && w < 1) ? | |
38: data_im[i * 1 + j] : 0; | |
39: data_col += 1 * 1; | |
40: } | |
41: } | |
42: } | |
43: } | |
44: | |
45: kernel void col2im( | |
46: const int n, | |
47: global float const *data_col, | |
48: global float* im_data, int im_offset) { | |
49: global float *data_im = im_data + im_offset; | |
50: | |
51: for (int index = get_group_id(0) * get_local_size(0) + get_local_id(0); index < (n); index += get_local_size(0) * get_num_groups(0)) { | |
52: float val = 0; | |
53: int w = index % 1 + 0; | |
54: int h = (index / 1) % 1 + 0; | |
55: int c = index / (1 * 1); | |
56: // compute the start and end of the output | |
57: int w_col_start = (w < 1) ? 0 : (w - 1) / 1 + 1; | |
58: int w_col_end = min(w / 1 + 1, 1); | |
59: int h_col_start = (h < 1) ? 0 : (h - 1) / 1 + 1; | |
60: int h_col_end = min(h / 1 + 1, 1); | |
61: | |
62: int offset = (c * 1 * 1 + h * 1 + w) * 1 * 1; | |
63: int coeff_h_col = (1 - 1 * 1 * 1) * 1; | |
64: int coeff_w_col = (1 - 1 * 1 * 1); | |
65: for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { | |
66: for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { | |
67: val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; | |
68: } | |
69: } | |
70: data_im[index] = val; | |
71: } | |
72: } | |
73: | |
74: | |
Out of resources, code -5 | |
forward kernel 0: cannot be used | |
forward kernel 1 time: 0ms | |
forward kernel 2: cannot be used | |
forward kernel 3: cannot be used | |
forward kernel 4: cannot be used | |
forward kernel 5: cannot be used | |
forward kernel 6: cannot be used | |
forward kernel 7: cannot be used | |
forward layer selected kernel 1 | |
drm_intel_gem_bo_context_exec() failed: Input/output error | |
terminate called after throwing an instance of 'std::runtime_error' | |
what(): | |
kernel source: | |
1: // Copyright Hugh Perkins 2014, 2015 hughperkins at gmail | |
2: // | |
3: // This Source Code Form is subject to the terms of the Mozilla Public License, | |
4: // v. 2.0. If a copy of the MPL was not distributed with this file, You can | |
5: // obtain one at http://mozilla.org/MPL/2.0/. | |
6: | |
7: // notes on non-odd filtersizes: | |
8: // for odd, imagesize and filtersize 3, padZeros = 0: | |
9: // output is a single square | |
10: // m and n should vary between -1,0,1 | |
11: // for even, imagesize and filtersize 2, padzeros = 0 | |
12: // output is a single square, which we can position at topleft or bottomrigth | |
13: // lets position it in bottomright | |
14: // then m and n should vary as -1,0 | |
15: // | |
16: // for even, imagesize and filtersize 2, padzeros = 1 | |
17: // output is 2 by 2 | |
18: // well... if it is even: | |
19: // - if we are not padding zeros, then we simply move our filter around the image somehow | |
20: // - if we are padding zeros, then we conceptually pad the bottom and right edge of the image with zeros by 1 | |
21: // filtersize remains the same | |
22: // m will vary as -1,0,1 | |
23: // outputrow is fixed by globalid | |
24: // inputrow should be unchanged... | |
25: // padzeros = 0: | |
26: // x x . . . . | |
27: // x x . . x x | |
28: // . . . . x x | |
29: // when filtersize even: | |
30: // new imagesize = oldimagesize - filtersize + 1 | |
31: // when filtersize odd: | |
32: // x x x . | |
33: // x x x . | |
34: // x x x . | |
35: // . . . . | |
36: // new imagesize = oldimagesize - filtersize + 1 | |
37: // padzeros = 1: | |
38: // x x | |
39: // x x . . x x . . . . . . . | |
40: // . . . x x . . x x . . . | |
41: // . . . . . . . x x . . x x | |
42: // outrow=0 outrow=1 outrow=2 x x | |
43: // outcol=0 outcol=1 outcol=2 outrow=3 | |
44: // outcol=3 | |
45: // when filtersize is even, and padzeros, imagesize grows by 1 each time... | |
46: // imagesize = oldimagesize + 1 | |
47: // when filtersize is odd | |
48: // x x x | |
49: // x x x . x x x . . . | |
50: // x x x . x x x . x x x | |
51: // . . . x x x . x x x | |
52: // x x x | |
53: | |
54: // images are organized like [imageId][plane][row][col] | |
55: // filters are organized like [filterid][inplane][filterrow][filtercol] | |
56: // output are organized like [imageid][filterid][row][col] | |
57: // global id is organized like output, ie: [imageid][outplane][outrow][outcol] | |
58: // - no local memory used currently | |
59: // - each thread: | |
60: // - loads a whole upstream cube | |
61: // - loads a whole filter cube | |
62: // - writes one output... | |
63: void kernel convolve_imagecubes_float2( | |
64: const int numExamples, | |
65: global const float *inputs, global const float *filters, | |
66: global float *output) { | |
67: int globalId = get_global_id(0); | |
68: | |
69: int outputImage2Id = globalId / gOutputSizeSquared; | |
70: int exampleId = outputImage2Id / gNumFilters; | |
71: int filterId = outputImage2Id % gNumFilters; | |
72: | |
73: // intraimage coords | |
74: int localid = globalId % gOutputSizeSquared; | |
75: int outputRow = localid / gOutputSize; | |
76: int outputCol = localid % gOutputSize; | |
77: | |
78: global float const*inputCube = inputs + exampleId * gNumInputPlanes * gInputSizeSquared; | |
79: global float const*filterCube = filters + filterId * gNumInputPlanes * gFilterSizeSquared; | |
80: | |
81: float sum = 0; | |
82: if (exampleId < numExamples) { | |
83: for (int inputPlaneIdx = 0; inputPlaneIdx < gNumInputPlanes; inputPlaneIdx++) { | |
84: global float const*inputPlane = inputCube + inputPlaneIdx * gInputSizeSquared; | |
85: global float const*filterPlane = filterCube + inputPlaneIdx * gFilterSizeSquared; | |
86: for (int u = -gHalfFilterSize; u <= gHalfFilterSize - gEven; u++) { | |
87: // trying to reduce register pressure... | |
88: #if gPadZeros == 1 | |
89: #define inputRowIdx (outputRow + u) | |
90: #else | |
91: #define inputRowIdx (outputRow + u + gHalfFilterSize) | |
92: #endif | |
93: global float const *inputRow = inputPlane + inputRowIdx * gInputSize; | |
94: global float const *filterRow = filterPlane + (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize; | |
95: bool rowOk = inputRowIdx >= 0 && inputRowIdx < gInputSize; | |
96: #pragma unroll | |
97: for (int v = -gHalfFilterSize; v <= gHalfFilterSize - gEven; v++) { | |
98: #if gPadZeros == 1 | |
99: #define inputColIdx (outputCol + v) | |
100: #else | |
101: #define inputColIdx (outputCol + v + gHalfFilterSize) | |
102: #endif | |
103: bool process = rowOk && inputColIdx >= 0 && inputColIdx < gInputSize; | |
104: if (process) { | |
105: sum += inputRow[inputColIdx] * filterRow[v]; | |
106: } | |
107: } | |
108: } | |
109: } | |
110: } | |
111: | |
112: if (exampleId < numExamples) { | |
113: output[globalId] = sum; | |
114: } | |
115: } | |
116: | |
117: | |
Out of resources, code -5 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment