Convolution and Pooling Computations in OpenCL
// clutils.hpp -- host-side OpenCL utilities
#ifndef OpenCLTest_clutils_hpp
#define OpenCLTest_clutils_hpp
#define __CL_ENABLE_EXCEPTIONS // must precede cl.hpp so the C++ wrapper throws cl::Error
#include "cl.hpp"
#include <string>
#include <type_traits>
#include <vector>

/* OpenCL utilities */
struct CLS // OpenCL state, used as a singleton
{
    static const CLS& get_instance()
    {
        static CLS cls; // constructed once, on first use
        return cls;
    }

    std::vector<cl::Device> devices;
    cl::Context ctx;
    cl::Program program;
    cl::CommandQueue queue;

private:
    CLS()
    {
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);
        platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
        ctx = cl::Context(devices);
        // on OS X, the path of an offline-compiled bitcode file can be
        // passed in place of the binary blob itself
        std::string bitcode_path = "OpenCL/kernel.cl.gpu_32.bc";
        program = cl::Program(ctx, devices, { {bitcode_path.c_str(), bitcode_path.length()} });
        program.build(devices);
        queue = cl::CommandQueue(ctx, devices[0]);
    }
};
inline void iter_kernel_args(cl::Kernel&, int) {} // end of recursion

template <class First, class... Rest>
inline void iter_kernel_args(cl::Kernel& kernel, int i, const First& first, const Rest& ...rest)
{
    kernel.setArg(i, first);
    iter_kernel_args(kernel, i + 1, rest...); // recur
}
// Builds a kernel from the singleton program and binds arguments 0..N-1 in one call.
template <class... Args>
cl::Kernel kernel_args(const std::string& name, const Args& ...args)
{
    cl::Kernel kernel(CLS::get_instance().program, name.c_str());
    iter_kernel_args(kernel, 0, args...);
    return kernel;
}
// Copies a single fundamental value (int, float, ...) into a device buffer.
template <class T, typename std::enable_if<std::is_fundamental<T>::value>::type* = nullptr>
cl::Buffer copy_to_buffer(T& input)
{
    return cl::Buffer(CLS::get_instance().ctx,
                      CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                      sizeof(T), &input);
}

// Copies a contiguous container (std::vector, Eigen matrix, ...) into a device buffer.
template <class T, typename std::enable_if<std::is_class<T>::value>::type* = nullptr>
cl::Buffer copy_to_buffer(T& input)
{
    return cl::Buffer(CLS::get_instance().ctx,
                      CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                      sizeof(*input.data()) * input.size(), input.data());
}
// Blocking reads back into a fundamental value or a contiguous container.
template <class T, typename std::enable_if<std::is_fundamental<T>::value>::type* = nullptr>
void copy_from_buffer(const cl::Buffer& src, T& dst)
{
    CLS::get_instance().queue.enqueueReadBuffer(src, CL_TRUE, 0, sizeof(T), &dst);
}

template <class T, typename std::enable_if<std::is_class<T>::value>::type* = nullptr>
void copy_from_buffer(const cl::Buffer& src, T& dst)
{
    CLS::get_instance().queue.enqueueReadBuffer(src, CL_TRUE, 0,
                                                sizeof(*dst.data()) * dst.size(), dst.data());
}
#endif // OpenCLTest_clutils_hpp
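Taken together, the helpers reduce a kernel launch to a few calls. A minimal sketch of the intended usage, assuming a hypothetical kernel named scale (not part of this gist):

// usage sketch only: "scale" is a hypothetical kernel, e.g.
//   __kernel void scale(__global float* v, const int n, const float k)
#include <vector>
#include "clutils.hpp"

void scale_example()
{
    std::vector<float> v(256, 1.0f);
    cl::Buffer v_buf = copy_to_buffer(v);                             // host -> device
    cl::Kernel k = kernel_args("scale", v_buf, (int)v.size(), 2.0f);  // binds args 0..2
    CLS::get_instance().queue.enqueueNDRangeKernel(k, cl::NullRange, cl::NDRange(v.size()));
    copy_from_buffer(v_buf, v);                                       // blocking device -> host
}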
// kernel.cl -- device code (all matrices are column-major, as in Eigen)

// Accumulates the b-th band of the convolutional weight matrix A into B:
// B += A.block(b * output_size, b * input_size, output_size, input_size * filter_size)
__kernel void add_matrix_conv(const __global float* A, __global float* B, const int lda, const int ldb,
                              const int output_size, const int input_size, const int filter_size, const int b)
{
    const int r = get_global_id(0);
    if (r >= output_size) return;
    const int c = get_global_id(1);
    if (c >= input_size * filter_size) return;
    // offset of element (b * output_size, b * input_size) in column-major A
    const __global float* A_iter = A + b * (output_size + input_size * lda);
    B[r + ldb * c] += A_iter[r + lda * c];
}
// Max pooling over the row dimension: for each output row i and column j,
// takes the max over input rows {i, i + stride, ..., i + stride * (pool - 1)}
// and stores the flat column-major index of the winner (for the backward pass).
__kernel void max_pool(const __global float* input, __global int* output,
                       const int input_rows, const int out_rows, const int cols,
                       const int pool, const int stride)
{
    const int i = get_global_id(0);
    if (i >= out_rows) return; // guard on the output rows; out_rows <= input_rows
    const int j = get_global_id(1);
    if (j >= cols) return;
    float max = input[i + input_rows * j];
    int max_coeff = i + input_rows * j;
    for (int p = 1; p < pool; ++p)
    {
        float m = input[i + stride * p + input_rows * j];
        if (m > max)
        {
            max = m;
            max_coeff = i + stride * p + input_rows * j;
        }
    }
    output[i + out_rows * j] = max_coeff;
}
// Backward max pooling: routes each incoming gradient to the input element
// that won the forward max. Here input is the gradient w.r.t. the pooled
// output, so input_rows equals the forward pass's out_rows; indices holds the
// flat indices stored by max_pool. out_rows is kept for symmetry but unused.
__kernel void back_max_pool(const __global float* input, __global float* output,
                            const int input_rows, const int out_rows, const int cols,
                            const __global int* indices)
{
    const int i = get_global_id(0);
    if (i >= input_rows) return;
    const int j = get_global_id(1);
    if (j >= cols) return;
    output[indices[i + input_rows * j]] += input[i + input_rows * j];
}
// Tests: CPU (Eigen) references vs. the OpenCL kernels, checked with googletest
#include <Eigen/Dense>
#include "gtest/gtest.h"
#include "clutils.hpp"

typedef Eigen::MatrixXf fmatrix;
TEST(CNN, PropConv)
{
    // my alternative to CLDriver (clutils.hpp)
    const CLS& cls = CLS::get_instance();
    const std::size_t output_size = 80;
    const std::size_t input_size = 33;
    const std::size_t filter_size = 5;
    const std::size_t band_size = 24;
    // delta of weights
    fmatrix W = fmatrix::Zero(output_size, input_size * filter_size);
    fmatrix W_conv = fmatrix::Ones(output_size * band_size, input_size * (band_size + filter_size - 1));
    // NoCL operations: accumulate the shifted blocks of W_conv into W
    fmatrix result_nocl = W;
    for (std::size_t b = 0; b < band_size; ++b)
    {
        result_nocl += W_conv.block(b * output_size, b * input_size, W.rows(), W.cols());
    }
    // CL operations
    cl::Buffer W_conv_buf = copy_to_buffer(W_conv);
    cl::Buffer W_buf = copy_to_buffer(W);
    for (std::size_t b = 0; b < band_size; ++b)
    {
        // my alternative to the BIND macro (clutils.hpp); the kernel takes
        // int arguments, so the sizes are cast down explicitly
        cl::Kernel kernel = kernel_args("add_matrix_conv",
                                        W_conv_buf, W_buf, (int)W_conv.rows(), (int)W.rows(),
                                        (int)output_size, (int)input_size, (int)filter_size, (int)b);
        cl::NDRange global(W.rows(), W.cols());
        cls.queue.enqueueNDRangeKernel(kernel, cl::NullRange, global);
    }
    // NoCL vs. CL comparison
    fmatrix result_cl(W.rows(), W.cols());
    copy_from_buffer(W_buf, result_cl);
    for (std::size_t r = 0; r < (std::size_t)result_nocl.rows(); ++r)
    {
        for (std::size_t c = 0; c < (std::size_t)result_nocl.cols(); ++c)
        {
            ASSERT_NEAR(result_nocl(r, c), result_cl(r, c), 0.001);
        }
    }
}
// CPU reference: returns, for each pooled cell, the flat index of the max element.
std::vector<std::size_t> maxpool(const fmatrix& input, const std::size_t pool, const std::size_t stride)
{
    const std::size_t out_rows = (input.rows() / stride - pool + 1) * stride;
    const std::size_t out_cols = input.cols();
    std::vector<std::size_t> ret(out_rows * out_cols);
    for (std::size_t i = 0; i < out_rows; ++i)
    {
        for (std::size_t j = 0; j < out_cols; ++j)
        {
            float max = input(i, j);
            std::size_t max_coeff = i + input.rows() * j;
            for (std::size_t k = 1; k < pool; ++k) // && i + stride * k < input.rows()
            {
                float m = input(i + stride * k, j);
                if (m > max)
                {
                    max = m;
                    max_coeff = i + stride * k + input.rows() * j;
                }
            }
            ret[i + out_rows * j] = max_coeff;
        }
    }
    return ret;
}
// CPU reference for the backward pass: scatter-adds each gradient to the
// element that won the forward max ("output" is the number of input rows).
fmatrix back_maxpool(const fmatrix& input, const std::vector<std::size_t>& indices, const std::size_t output)
{
    fmatrix ret = fmatrix::Zero(output, input.cols());
    for (std::size_t i = 0; i < (std::size_t)input.rows(); ++i)
    {
        for (std::size_t j = 0; j < (std::size_t)input.cols(); ++j)
        {
            *(ret.data() + indices[i + input.rows() * j]) += input(i, j);
        }
    }
    return ret;
}
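back_maxpool has no test of its own in the gist; a minimal sanity check might look like this (a sketch only; the test name and sizes are illustrative):

TEST(CNN, BackPoolSketch) // not in the original gist
{
    fmatrix in_mat = fmatrix::Random(6, 2);                     // input_rows x batch
    const auto idx = maxpool(in_mat, /*pool=*/2, /*stride=*/2); // forward argmax indices
    fmatrix grad = fmatrix::Ones(4, 2);                         // out_rows x batch
    fmatrix scattered = back_maxpool(grad, idx, (std::size_t)in_mat.rows());
    // each pooled cell scatter-adds into exactly one input element,
    // so the total gradient mass is conserved
    ASSERT_NEAR(grad.sum(), scattered.sum(), 1e-5);
}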
TEST(CNN, PropPool)
{
    const std::size_t pool = 2;
    const std::size_t stride = 2;
    const std::size_t input = 6; // NOTE: this means W_conv.rows(), the output of the convolution layer
    const std::size_t output = (input / stride - pool + 1) * stride; // no overlap
    const std::size_t batch = 2;
    ASSERT_EQ(4, output);
    // fold horizontally by stride, seek vertically by pool
    // (the 6x1 columns are written two-per-line below to show the stride-2 folding)
    fmatrix in0(input, 1);
    in0 <<
        1, 2,
        3, 2,
        1, 4;
    fmatrix out0(output, 1); // expected pooled values, kept for reference
    out0 <<
        3, 2,
        3, 4;
    fmatrix in1(input, 1);
    in1 <<
        2, 1,
        0, 1,
        3, 0;
    fmatrix out1(output, 1);
    out1 <<
        2, 1,
        3, 1;
    fmatrix in_mat(input, batch);
    in_mat << in0, in1;
    fmatrix out_mat(output, batch);
    out_mat << out0, out1;
    std::vector<std::size_t> expected_indices =
        {2, 1, 2, 5,                                  // in0 -> out0
         0 + input, 1 + input, 4 + input, 3 + input}; // in1 -> out1
    // NoCL operations
    const auto result_nocl = maxpool(in_mat, pool, stride);
    for (std::size_t i = 0; i < expected_indices.size(); ++i)
    {
        ASSERT_EQ(expected_indices[i], result_nocl[i]);
    }
}
TEST(CNN, PropPoolRandom)
{
    const std::size_t pool = 5;
    const std::size_t stride = 33;
    const std::size_t input = 24 * 33; // NOTE: this means W_conv.rows(), the output of the convolution layer
    const std::size_t output = (input / stride - pool + 1) * stride; // no overlap
    const std::size_t batch = 128;
    fmatrix in_mat = fmatrix::Random(input, batch);
    // NoCL operations
    const auto result_nocl = maxpool(in_mat, pool, stride);
    // CL operations (the kernel takes int arguments, hence the casts)
    cl::Buffer in_buf = copy_to_buffer(in_mat);
    std::vector<int> result_cl(output * batch);
    cl::Buffer out_buf = copy_to_buffer(result_cl); // initial contents don't matter; the kernel overwrites them
    cl::Kernel kernel = kernel_args("max_pool", in_buf, out_buf,
                                    (int)input, (int)output, (int)batch, (int)pool, (int)stride);
    cl::NDRange global(output, batch);
    CLS::get_instance().queue.enqueueNDRangeKernel(kernel, cl::NullRange, global);
    copy_from_buffer(out_buf, result_cl);
    for (std::size_t i = 0; i < result_nocl.size(); ++i)
    {
        ASSERT_EQ(result_nocl[i], (std::size_t)result_cl[i]);
    }
}
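A device-side launch of back_max_pool is not part of the gist, but it would presumably mirror the max_pool call above (a sketch under that assumption; the helper name is illustrative, grad_input is expected to arrive zero-filled, and idx_buf is the index buffer written by max_pool):

// sketch: host-side launch of back_max_pool, mirroring the forward call
void launch_back_max_pool(fmatrix& grad_pooled, fmatrix& grad_input, cl::Buffer& idx_buf)
{
    const int pooled_rows = (int)grad_pooled.rows(); // rows after pooling
    const int full_rows   = (int)grad_input.rows();  // rows before pooling
    const int cols        = (int)grad_pooled.cols();
    cl::Buffer in_buf  = copy_to_buffer(grad_pooled);
    cl::Buffer out_buf = copy_to_buffer(grad_input);
    cl::Kernel k = kernel_args("back_max_pool", in_buf, out_buf,
                               pooled_rows, full_rows, cols, idx_buf);
    CLS::get_instance().queue.enqueueNDRangeKernel(k, cl::NullRange,
                                                   cl::NDRange(pooled_rows, cols));
    copy_from_buffer(out_buf, grad_input); // blocking read of the scattered gradients
}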
int main(int argc, const char* argv[])
{
    ::testing::InitGoogleTest(&argc, (char**)argv);
    return RUN_ALL_TESTS();
}

Test output:
[==========] Running 3 tests from 1 test case.
[----------] Global test environment set-up.
[----------] 3 tests from CNN
[ RUN ] CNN.PropConv
[ OK ] CNN.PropConv (59 ms)
[ RUN ] CNN.PropPool
[ OK ] CNN.PropPool (0 ms)
[ RUN ] CNN.PropPoolRandom
[ OK ] CNN.PropPoolRandom (4 ms)
[----------] 3 tests from CNN (63 ms total)
[----------] Global test environment tear-down
[==========] 3 tests from 1 test case ran. (63 ms total)
[ PASSED ] 3 tests.