Convolutional-Pooling Computing in OpenCL
#ifndef OpenCLTest_clutils_hpp
#define OpenCLTest_clutils_hpp

// __CL_ENABLE_EXCEPTIONS must be defined before cl.hpp is included,
// otherwise the wrapper silently returns error codes instead of throwing.
#define __CL_ENABLE_EXCEPTIONS
#include "cl.hpp"

#include <string>
#include <type_traits>
#include <vector>

/* OpenCL utilities */
struct CLS // lazily initialized singleton holding the shared OpenCL state
{
    CLS()
    {
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);
        platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
        ctx = cl::Context(devices);
        // precompiled kernel bitcode (Apple-style offline OpenCL compilation);
        // the path itself is passed as the program "binary"
        std::string bitcode_path = "OpenCL/kernel.cl.gpu_32.bc";
        program = cl::Program(ctx, devices, { {bitcode_path.c_str(), bitcode_path.length()} });
        program.build(devices);
        queue = cl::CommandQueue(ctx, devices[0]);
    }

public:
    static const CLS& get_instance()
    {
        static CLS cls;
        return cls;
    }

    std::vector<cl::Device> devices;
    cl::Context ctx;
    cl::Program program;
    cl::CommandQueue queue;
};
// Recursively set kernel arguments at increasing argument indices.
inline void iter_kernel_args(cl::Kernel&, int) {} // end of recursion

template <class First, class... Rest>
inline void iter_kernel_args(cl::Kernel& kernel, int i, const First& first, const Rest&... rest)
{
    kernel.setArg(i, first);
    iter_kernel_args(kernel, i + 1, rest...); // recur
}

// Create a kernel from the singleton program and bind all arguments in order.
template <class... Args>
cl::Kernel kernel_args(const std::string& name, const Args&... args)
{
    cl::Kernel kernel(CLS::get_instance().program, name.c_str());
    iter_kernel_args(kernel, 0, args...);
    return kernel;
}
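// Usage sketch (argument names here are placeholders; see the tests in this gist for real calls):
//   cl::Kernel k = kernel_args("max_pool", in_buf, out_buf, in_rows, out_rows, cols, pool, stride);
//   CLS::get_instance().queue.enqueueNDRangeKernel(k, cl::NullRange, cl::NDRange(out_rows, cols));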
// Copy a single fundamental value (int, float, ...) into a new device buffer.
template <class T, typename std::enable_if<std::is_fundamental<T>::value>::type* = nullptr>
cl::Buffer copy_to_buffer(T& input)
{
    return cl::Buffer
        (CLS::get_instance().ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(T), &input);
}

// Copy a contiguous container (Eigen matrix, std::vector, ...) into a new device buffer.
template <class T, typename std::enable_if<std::is_class<T>::value>::type* = nullptr>
cl::Buffer copy_to_buffer(T& input)
{
    return cl::Buffer
        (CLS::get_instance().ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
         sizeof(*input.data()) * input.size(), input.data());
}

// Blocking read of a single fundamental value back from a device buffer.
template <class T, typename std::enable_if<std::is_fundamental<T>::value>::type* = nullptr>
void copy_from_buffer(const cl::Buffer& src, T& dst)
{
    CLS::get_instance().queue.enqueueReadBuffer
        (src, CL_TRUE, 0, sizeof(T), &dst);
}

// Blocking read of a contiguous container back from a device buffer.
template <class T, typename std::enable_if<std::is_class<T>::value>::type* = nullptr>
void copy_from_buffer(const cl::Buffer& src, T& dst)
{
    CLS::get_instance().queue.enqueueReadBuffer
        (src, CL_TRUE, 0, sizeof(*dst.data()) * dst.size(), dst.data());
}

#endif
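The CLS constructor above passes a precompiled bitcode path as the program binary, which ties it to Apple's offline OpenCL toolchain. As a rough portability sketch (an assumption, not something the gist does; build_from_source is a made-up helper name), the same program could instead be built from the kernel source at run time:

#include <fstream>
#include <sstream>

// Hypothetical helper: read a .cl source file and build it for the given devices.
inline cl::Program build_from_source(const cl::Context& ctx,
                                     const std::vector<cl::Device>& devices,
                                     const std::string& cl_path)
{
    std::ifstream ifs(cl_path);
    std::stringstream ss;
    ss << ifs.rdbuf();
    const std::string src = ss.str();
    cl::Program program(ctx, cl::Program::Sources{ {src.c_str(), src.length()} });
    program.build(devices);
    return program;
}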
// B += the b-th band-shifted block of A (column-major, leading dimensions lda/ldb).
__kernel void add_matrix_conv(const __global float* A, __global float* B, const int lda, const int ldb,
                              const int output_size, const int input_size, const int filter_size, const int b)
{
    const int r = get_global_id(0);
    if (r >= output_size) return;
    const int c = get_global_id(1);
    if (c >= input_size * filter_size) return;
    // start of block (b * output_size, b * input_size) of A in column-major order
    const __global float* A_iter = A + b * (output_size + input_size * lda);
    B[r + ldb * c] += A_iter[r + lda * c];
}

// Max pooling over `pool` bands spaced `stride` rows apart; writes the argmax index
// (in the column-major layout of `input`) for each output element.
__kernel void max_pool(const __global float* input, __global int* output,
                       const int input_rows, const int out_rows, const int cols,
                       const int pool, const int stride)
{
    const int i = get_global_id(0);
    if (i >= out_rows) return; // i indexes output rows, so guard against out_rows
    const int j = get_global_id(1);
    if (j >= cols) return;
    float max_val = input[i + input_rows * j];
    int max_coeff = i + input_rows * j;
    for (int p = 1; p < pool; ++p)
    {
        float m = input[i + stride * p + input_rows * j];
        if (m > max_val)
        {
            max_val = m;
            max_coeff = i + stride * p + input_rows * j;
        }
    }
    output[i + out_rows * j] = max_coeff;
}

// Scatter the incoming gradient back to the argmax positions recorded by max_pool.
// Note: if pooling windows overlap, several work-items may write the same index,
// so the += below is not race-free in that case.
__kernel void back_max_pool(const __global float* input, __global float* output,
                            const int input_rows, const int out_rows, const int cols,
                            const __global int* indices)
{
    const int i = get_global_id(0);
    if (i >= input_rows) return;
    const int j = get_global_id(1);
    if (j >= cols) return;
    output[indices[i + input_rows * j]] += input[i + input_rows * j];
}
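All three kernels address their buffers column-major, matching Eigen's default storage for MatrixXf: element (r, c) of an m-by-n matrix sits at linear offset r + m * c, which is why the host matrices can be copied to device buffers byte for byte. A minimal host-side illustration of that layout (added example, not part of the gist):

#include <cassert>
#include <Eigen/Dense>

int main()
{
    Eigen::MatrixXf A(3, 2);
    A << 1, 2,
         3, 4,
         5, 6;
    // column-major: A(2, 1) == 6 lives at linear offset 2 + 3 * 1 == 5
    assert(A.data()[2 + 3 * 1] == A(2, 1));
    return 0;
}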
#include <vector>

#include <Eigen/Dense>
#include "gtest/gtest.h"

#include "clutils.hpp"

typedef Eigen::MatrixXf fmatrix;
TEST(CNN, PropConv)
{
    // my alternative of CLDriver (clutils.hpp)
    const CLS& cls = CLS::get_instance();
    const std::size_t output_size = 80;
    const std::size_t input_size = 33;
    const std::size_t filter_size = 5;
    const std::size_t band_size = 24;
    // delta of weights
    fmatrix W = fmatrix::Zero(output_size, input_size * filter_size);
    fmatrix W_conv = fmatrix::Ones(output_size * band_size, input_size * (band_size + filter_size - 1));

    // NoCL operations
    fmatrix result_nocl = W;
    for (std::size_t b = 0; b < band_size; ++b)
    {
        result_nocl += W_conv.block(b * output_size, b * input_size, W.rows(), W.cols());
    }

    // CL operations
    cl::Buffer W_conv_buf = copy_to_buffer(W_conv);
    cl::Buffer W_buf = copy_to_buffer(W);
    for (std::size_t b = 0; b < band_size; ++b)
    {
        // my alternative of the BIND macro (clutils.hpp);
        // the kernel takes int arguments, so the sizes are cast explicitly
        cl::Kernel kernel = kernel_args("add_matrix_conv",
                                        W_conv_buf, W_buf,
                                        static_cast<int>(W_conv.rows()), static_cast<int>(W.rows()),
                                        static_cast<int>(output_size), static_cast<int>(input_size),
                                        static_cast<int>(filter_size), static_cast<int>(b));
        cl::NDRange global(W.rows(), W.cols());
        cls.queue.enqueueNDRangeKernel(kernel, cl::NullRange, global);
    }

    // NoCL vs. CL comparison
    fmatrix result_cl(W.rows(), W.cols());
    copy_from_buffer(W_buf, result_cl);
    for (std::size_t r = 0; r < result_nocl.rows(); ++r)
    {
        for (std::size_t c = 0; c < result_nocl.cols(); ++c)
        {
            ASSERT_NEAR(result_nocl(r, c), result_cl(r, c), 0.001);
        }
    }
}
// CPU reference: returns the column-major argmax index for each pooled element.
std::vector<std::size_t> maxpool(const fmatrix& input, const std::size_t pool, const std::size_t stride)
{
    const std::size_t out_rows = (input.rows() / stride - pool + 1) * stride;
    const std::size_t out_cols = input.cols();
    std::vector<std::size_t> ret(out_rows * out_cols);
    for (std::size_t i = 0; i < out_rows; ++i)
    {
        for (std::size_t j = 0; j < out_cols; ++j)
        {
            float max = input(i, j);
            std::size_t max_coeff = i + input.rows() * j;
            for (std::size_t k = 1; k < pool; ++k) // && i + stride * k < input.rows()
            {
                float m = input(i + stride * k, j);
                if (m > max)
                {
                    max = m;
                    max_coeff = i + stride * k + input.rows() * j;
                }
            }
            ret[i + out_rows * j] = max_coeff;
        }
    }
    return ret;
}
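// Note on the out_rows formula above: the input is viewed as input.rows() / stride bands of
// `stride` rows each, and each output element takes the max over `pool` consecutive bands at
// the same within-band offset, so there are input.rows() / stride - pool + 1 window positions:
//   out_rows = (input.rows() / stride - pool + 1) * stride.
// For PropPool below, (6 / 2 - 2 + 1) * 2 = 4, which is what ASSERT_EQ(4, output) checks.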
// CPU reference for the backward pass: scatter-add each gradient to its argmax position.
fmatrix back_maxpool(const fmatrix& input, const std::vector<std::size_t>& indices, const std::size_t output)
{
    fmatrix ret = fmatrix::Zero(output, input.cols());
    for (std::size_t i = 0; i < input.rows(); ++i)
    {
        for (std::size_t j = 0; j < input.cols(); ++j)
        {
            *(ret.data() + indices[i + input.rows() * j]) += input(i, j);
        }
    }
    return ret;
}
TEST(CNN, PropPool)
{
    const std::size_t pool = 2;
    const std::size_t stride = 2;
    const std::size_t input = 6; // NOTE: this means W_conv.rows(), the output of the convolution layer
    const std::size_t output = (input / stride - pool + 1) * stride; // no overlap
    const std::size_t batch = 2;
    ASSERT_EQ(4, output);

    // folded horizontally by stride, pooled vertically over `pool` bands
    fmatrix in0(input, 1);
    in0 <<
        1, 2,
        3, 2,
        1, 4;
    fmatrix out0(output, 1); // expected pooled values (documentation only; the test checks indices)
    out0 <<
        3, 2,
        3, 4;
    fmatrix in1(input, 1);
    in1 <<
        2, 1,
        0, 1,
        3, 0;
    fmatrix out1(output, 1);
    out1 <<
        2, 1,
        3, 1;
    fmatrix in_mat(input, batch);
    in_mat << in0, in1;
    fmatrix out_mat(output, batch);
    out_mat << out0, out1;
    std::vector<std::size_t> expected_indices =
        {2, 1, 2, 5,                                  // in0 -> out0
         0 + input, 1 + input, 4 + input, 3 + input}; // in1 -> out1

    // NoCL operations
    const auto result_nocl = maxpool(in_mat, pool, stride);
    for (std::size_t i = 0; i < expected_indices.size(); ++i)
    {
        ASSERT_EQ(expected_indices[i], result_nocl[i]);
    }
}
TEST(CNN, PropPoolRandom)
{
    const std::size_t pool = 5;
    const std::size_t stride = 33;
    const std::size_t input = 24 * 33; // NOTE: this means W_conv.rows(), the output of the convolution layer
    const std::size_t output = (input / stride - pool + 1) * stride; // no overlap
    const std::size_t batch = 128;
    fmatrix in_mat = fmatrix::Random(input, batch);
    fmatrix out_mat = fmatrix::Random(output, batch);

    // NoCL operations
    const auto result_nocl = maxpool(in_mat, pool, stride);

    // CL operations
    cl::Buffer in_buf = copy_to_buffer(in_mat);
    std::vector<int> result_cl(output * batch);
    cl::Buffer out_buf = copy_to_buffer(result_cl); // only the allocation matters here, not the contents
    // the kernel takes int arguments, so the sizes are cast explicitly
    cl::Kernel kernel = kernel_args("max_pool", in_buf, out_buf,
                                    static_cast<int>(input), static_cast<int>(output),
                                    static_cast<int>(batch), static_cast<int>(pool),
                                    static_cast<int>(stride));
    cl::NDRange global(output, batch);
    CLS::get_instance().queue.enqueueNDRangeKernel(kernel, cl::NullRange, global);
    copy_from_buffer(out_buf, result_cl);
    for (std::size_t i = 0; i < result_nocl.size(); ++i)
    {
        ASSERT_EQ(result_nocl[i], result_cl[i]);
    }
}
int main(int argc, char* argv[])
{
    ::testing::InitGoogleTest(&argc, argv);
    return RUN_ALL_TESTS();
}
[==========] Running 3 tests from 1 test case.
[----------] Global test environment set-up.
[----------] 3 tests from CNN
[ RUN      ] CNN.PropConv
[       OK ] CNN.PropConv (59 ms)
[ RUN      ] CNN.PropPool
[       OK ] CNN.PropPool (0 ms)
[ RUN      ] CNN.PropPoolRandom
[       OK ] CNN.PropPoolRandom (4 ms)
[----------] 3 tests from CNN (63 ms total)
[----------] Global test environment tear-down
[==========] 3 tests from 1 test case ran. (63 ms total)
[  PASSED  ] 3 tests.