Convolution and Pooling Computations in OpenCL
// clutils.hpp -- host-side OpenCL utilities
#ifndef OpenCLTest_clutils_hpp
#define OpenCLTest_clutils_hpp
#define __CL_ENABLE_EXCEPTIONS // must precede cl.hpp so the C++ wrapper throws cl::Error
#include "cl.hpp"
#include <string>
#include <type_traits>
#include <vector>

/* OpenCL utilities */
struct CLS // OpenCL state, used as a singleton
{
    static const CLS& get_instance()
    {
        static CLS cls; // constructed once, on first use
        return cls;
    }

    std::vector<cl::Device> devices;
    cl::Context ctx;
    cl::Program program;
    cl::CommandQueue queue;

private:
    CLS()
    {
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);
        platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
        ctx = cl::Context(devices);
        // on OS X, the path of an offline-compiled bitcode file can be
        // passed in place of the binary blob itself
        std::string bitcode_path = "OpenCL/kernel.cl.gpu_32.bc";
        program = cl::Program(ctx, devices, { {bitcode_path.c_str(), bitcode_path.length()} });
        program.build(devices);
        queue = cl::CommandQueue(ctx, devices[0]);
    }
};
inline void iter_kernel_args(cl::Kernel&, int) {} // end of recursion

template <class First, class... Rest>
inline void iter_kernel_args(cl::Kernel& kernel, int i, const First& first, const Rest& ...rest)
{
    kernel.setArg(i, first);
    iter_kernel_args(kernel, i + 1, rest...); // recur
}
// Builds a kernel from the singleton program and binds arguments 0..N-1 in one call.
template <class... Args>
cl::Kernel kernel_args(const std::string& name, const Args& ...args)
{
    cl::Kernel kernel(CLS::get_instance().program, name.c_str());
    iter_kernel_args(kernel, 0, args...);
    return kernel;
}
// Copies a single fundamental value (int, float, ...) into a device buffer.
template <class T, typename std::enable_if<std::is_fundamental<T>::value>::type* = nullptr>
cl::Buffer copy_to_buffer(T& input)
{
    return cl::Buffer(CLS::get_instance().ctx,
                      CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                      sizeof(T), &input);
}

// Copies a contiguous container (std::vector, Eigen matrix, ...) into a device buffer.
template <class T, typename std::enable_if<std::is_class<T>::value>::type* = nullptr>
cl::Buffer copy_to_buffer(T& input)
{
    return cl::Buffer(CLS::get_instance().ctx,
                      CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                      sizeof(*input.data()) * input.size(), input.data());
}
// Blocking reads back into a fundamental value or a contiguous container.
template <class T, typename std::enable_if<std::is_fundamental<T>::value>::type* = nullptr>
void copy_from_buffer(const cl::Buffer& src, T& dst)
{
    CLS::get_instance().queue.enqueueReadBuffer(src, CL_TRUE, 0, sizeof(T), &dst);
}

template <class T, typename std::enable_if<std::is_class<T>::value>::type* = nullptr>
void copy_from_buffer(const cl::Buffer& src, T& dst)
{
    CLS::get_instance().queue.enqueueReadBuffer(src, CL_TRUE, 0,
                                                sizeof(*dst.data()) * dst.size(), dst.data());
}
#endif // OpenCLTest_clutils_hpp
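Taken together, the helpers reduce a kernel launch to a few calls. A minimal sketch of the intended usage, assuming a hypothetical kernel named scale (not part of this gist):

// usage sketch only: "scale" is a hypothetical kernel, e.g.
//   __kernel void scale(__global float* v, const int n, const float k)
#include <vector>
#include "clutils.hpp"

void scale_example()
{
    std::vector<float> v(256, 1.0f);
    cl::Buffer v_buf = copy_to_buffer(v);                             // host -> device
    cl::Kernel k = kernel_args("scale", v_buf, (int)v.size(), 2.0f);  // binds args 0..2
    CLS::get_instance().queue.enqueueNDRangeKernel(k, cl::NullRange, cl::NDRange(v.size()));
    copy_from_buffer(v_buf, v);                                       // blocking device -> host
}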
// kernel.cl -- device code (all matrices are column-major, as in Eigen)

// Accumulates the b-th band of the convolutional weight matrix A into B:
// B += A.block(b * output_size, b * input_size, output_size, input_size * filter_size)
__kernel void add_matrix_conv(const __global float* A, __global float* B, const int lda, const int ldb,
                              const int output_size, const int input_size, const int filter_size, const int b)
{
    const int r = get_global_id(0);
    if (r >= output_size) return;
    const int c = get_global_id(1);
    if (c >= input_size * filter_size) return;
    // offset of element (b * output_size, b * input_size) in column-major A
    const __global float* A_iter = A + b * (output_size + input_size * lda);
    B[r + ldb * c] += A_iter[r + lda * c];
}
// Max pooling over the row dimension: for each output row i and column j,
// takes the max over input rows {i, i + stride, ..., i + stride * (pool - 1)}
// and stores the flat column-major index of the winner (for the backward pass).
__kernel void max_pool(const __global float* input, __global int* output,
                       const int input_rows, const int out_rows, const int cols,
                       const int pool, const int stride)
{
    const int i = get_global_id(0);
    if (i >= out_rows) return; // guard on the output rows; out_rows <= input_rows
    const int j = get_global_id(1);
    if (j >= cols) return;
    float max = input[i + input_rows * j];
    int max_coeff = i + input_rows * j;
    for (int p = 1; p < pool; ++p)
    {
        float m = input[i + stride * p + input_rows * j];
        if (m > max)
        {
            max = m;
            max_coeff = i + stride * p + input_rows * j;
        }
    }
    output[i + out_rows * j] = max_coeff;
}
// Backward max pooling: routes each incoming gradient to the input element
// that won the forward max. Here input is the gradient w.r.t. the pooled
// output, so input_rows equals the forward pass's out_rows; indices holds the
// flat indices stored by max_pool. out_rows is kept for symmetry but unused.
__kernel void back_max_pool(const __global float* input, __global float* output,
                            const int input_rows, const int out_rows, const int cols,
                            const __global int* indices)
{
    const int i = get_global_id(0);
    if (i >= input_rows) return;
    const int j = get_global_id(1);
    if (j >= cols) return;
    output[indices[i + input_rows * j]] += input[i + input_rows * j];
}
// Tests: CPU (Eigen) references vs. the OpenCL kernels, checked with googletest
#include <Eigen/Dense>
#include "gtest/gtest.h"
#include "clutils.hpp"

typedef Eigen::MatrixXf fmatrix;
TEST(CNN, PropConv)
{
    // my alternative to CLDriver (clutils.hpp)
    const CLS& cls = CLS::get_instance();
    const std::size_t output_size = 80;
    const std::size_t input_size = 33;
    const std::size_t filter_size = 5;
    const std::size_t band_size = 24;
    // delta of weights
    fmatrix W = fmatrix::Zero(output_size, input_size * filter_size);
    fmatrix W_conv = fmatrix::Ones(output_size * band_size, input_size * (band_size + filter_size - 1));
    // NoCL operations: accumulate the shifted blocks of W_conv into W
    fmatrix result_nocl = W;
    for (std::size_t b = 0; b < band_size; ++b)
    {
        result_nocl += W_conv.block(b * output_size, b * input_size, W.rows(), W.cols());
    }
    // CL operations
    cl::Buffer W_conv_buf = copy_to_buffer(W_conv);
    cl::Buffer W_buf = copy_to_buffer(W);
    for (std::size_t b = 0; b < band_size; ++b)
    {
        // my alternative to the BIND macro (clutils.hpp); the kernel takes
        // int arguments, so the sizes are cast down explicitly
        cl::Kernel kernel = kernel_args("add_matrix_conv",
                                        W_conv_buf, W_buf, (int)W_conv.rows(), (int)W.rows(),
                                        (int)output_size, (int)input_size, (int)filter_size, (int)b);
        cl::NDRange global(W.rows(), W.cols());
        cls.queue.enqueueNDRangeKernel(kernel, cl::NullRange, global);
    }
    // NoCL vs. CL comparison
    fmatrix result_cl(W.rows(), W.cols());
    copy_from_buffer(W_buf, result_cl);
    for (std::size_t r = 0; r < (std::size_t)result_nocl.rows(); ++r)
    {
        for (std::size_t c = 0; c < (std::size_t)result_nocl.cols(); ++c)
        {
            ASSERT_NEAR(result_nocl(r, c), result_cl(r, c), 0.001);
        }
    }
}
// CPU reference: returns, for each pooled cell, the flat index of the max element.
std::vector<std::size_t> maxpool(const fmatrix& input, const std::size_t pool, const std::size_t stride)
{
    const std::size_t out_rows = (input.rows() / stride - pool + 1) * stride;
    const std::size_t out_cols = input.cols();
    std::vector<std::size_t> ret(out_rows * out_cols);
    for (std::size_t i = 0; i < out_rows; ++i)
    {
        for (std::size_t j = 0; j < out_cols; ++j)
        {
            float max = input(i, j);
            std::size_t max_coeff = i + input.rows() * j;
            for (std::size_t k = 1; k < pool; ++k) // && i + stride * k < input.rows()
            {
                float m = input(i + stride * k, j);
                if (m > max)
                {
                    max = m;
                    max_coeff = i + stride * k + input.rows() * j;
                }
            }
            ret[i + out_rows * j] = max_coeff;
        }
    }
    return ret;
}
// CPU reference for the backward pass: scatter-adds each gradient to the
// element that won the forward max ("output" is the number of input rows).
fmatrix back_maxpool(const fmatrix& input, const std::vector<std::size_t>& indices, const std::size_t output)
{
    fmatrix ret = fmatrix::Zero(output, input.cols());
    for (std::size_t i = 0; i < (std::size_t)input.rows(); ++i)
    {
        for (std::size_t j = 0; j < (std::size_t)input.cols(); ++j)
        {
            *(ret.data() + indices[i + input.rows() * j]) += input(i, j);
        }
    }
    return ret;
}
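back_maxpool has no test of its own in the gist; a minimal sanity check might look like this (a sketch only; the test name and sizes are illustrative):

TEST(CNN, BackPoolSketch) // not in the original gist
{
    fmatrix in_mat = fmatrix::Random(6, 2);                     // input_rows x batch
    const auto idx = maxpool(in_mat, /*pool=*/2, /*stride=*/2); // forward argmax indices
    fmatrix grad = fmatrix::Ones(4, 2);                         // out_rows x batch
    fmatrix scattered = back_maxpool(grad, idx, (std::size_t)in_mat.rows());
    // each pooled cell scatter-adds into exactly one input element,
    // so the total gradient mass is conserved
    ASSERT_NEAR(grad.sum(), scattered.sum(), 1e-5);
}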
TEST(CNN, PropPool)
{
    const std::size_t pool = 2;
    const std::size_t stride = 2;
    const std::size_t input = 6; // NOTE: this means W_conv.rows(), the output of the convolution layer
    const std::size_t output = (input / stride - pool + 1) * stride; // no overlap
    const std::size_t batch = 2;
    ASSERT_EQ(4, output);
    // fold horizontally by stride, seek vertically by pool
    // (the 6x1 columns are written two-per-line below to show the stride-2 folding)
    fmatrix in0(input, 1);
    in0 <<
        1, 2,
        3, 2,
        1, 4;
    fmatrix out0(output, 1); // expected pooled values, kept for reference
    out0 <<
        3, 2,
        3, 4;
    fmatrix in1(input, 1);
    in1 <<
        2, 1,
        0, 1,
        3, 0;
    fmatrix out1(output, 1);
    out1 <<
        2, 1,
        3, 1;
    fmatrix in_mat(input, batch);
    in_mat << in0, in1;
    fmatrix out_mat(output, batch);
    out_mat << out0, out1;
    std::vector<std::size_t> expected_indices =
        {2, 1, 2, 5,                                  // in0 -> out0
         0 + input, 1 + input, 4 + input, 3 + input}; // in1 -> out1
    // NoCL operations
    const auto result_nocl = maxpool(in_mat, pool, stride);
    for (std::size_t i = 0; i < expected_indices.size(); ++i)
    {
        ASSERT_EQ(expected_indices[i], result_nocl[i]);
    }
}
TEST(CNN, PropPoolRandom)
{
    const std::size_t pool = 5;
    const std::size_t stride = 33;
    const std::size_t input = 24 * 33; // NOTE: this means W_conv.rows(), the output of the convolution layer
    const std::size_t output = (input / stride - pool + 1) * stride; // no overlap
    const std::size_t batch = 128;
    fmatrix in_mat = fmatrix::Random(input, batch);
    // NoCL operations
    const auto result_nocl = maxpool(in_mat, pool, stride);
    // CL operations (the kernel takes int arguments, hence the casts)
    cl::Buffer in_buf = copy_to_buffer(in_mat);
    std::vector<int> result_cl(output * batch);
    cl::Buffer out_buf = copy_to_buffer(result_cl); // initial contents don't matter; the kernel overwrites them
    cl::Kernel kernel = kernel_args("max_pool", in_buf, out_buf,
                                    (int)input, (int)output, (int)batch, (int)pool, (int)stride);
    cl::NDRange global(output, batch);
    CLS::get_instance().queue.enqueueNDRangeKernel(kernel, cl::NullRange, global);
    copy_from_buffer(out_buf, result_cl);
    for (std::size_t i = 0; i < result_nocl.size(); ++i)
    {
        ASSERT_EQ(result_nocl[i], (std::size_t)result_cl[i]);
    }
}
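A device-side launch of back_max_pool is not part of the gist, but it would presumably mirror the max_pool call above (a sketch under that assumption; the helper name is illustrative, grad_input is expected to arrive zero-filled, and idx_buf is the index buffer written by max_pool):

// sketch: host-side launch of back_max_pool, mirroring the forward call
void launch_back_max_pool(fmatrix& grad_pooled, fmatrix& grad_input, cl::Buffer& idx_buf)
{
    const int pooled_rows = (int)grad_pooled.rows(); // rows after pooling
    const int full_rows   = (int)grad_input.rows();  // rows before pooling
    const int cols        = (int)grad_pooled.cols();
    cl::Buffer in_buf  = copy_to_buffer(grad_pooled);
    cl::Buffer out_buf = copy_to_buffer(grad_input);
    cl::Kernel k = kernel_args("back_max_pool", in_buf, out_buf,
                               pooled_rows, full_rows, cols, idx_buf);
    CLS::get_instance().queue.enqueueNDRangeKernel(k, cl::NullRange,
                                                   cl::NDRange(pooled_rows, cols));
    copy_from_buffer(out_buf, grad_input); // blocking read of the scattered gradients
}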
int main(int argc, const char* argv[])
{
    ::testing::InitGoogleTest(&argc, (char**)argv);
    return RUN_ALL_TESTS();
}

Test output:
[==========] Running 3 tests from 1 test case.
[----------] Global test environment set-up.
[----------] 3 tests from CNN
[ RUN ] CNN.PropConv
[ OK ] CNN.PropConv (59 ms)
[ RUN ] CNN.PropPool
[ OK ] CNN.PropPool (0 ms)
[ RUN ] CNN.PropPoolRandom
[ OK ] CNN.PropPoolRandom (4 ms)
[----------] 3 tests from CNN (63 ms total)
[----------] Global test environment tear-down
[==========] 3 tests from 1 test case ran. (63 ms total)
[ PASSED ] 3 tests.