Created
May 14, 2020 09:42
-
-
Save mcleary/2ecdf5edbe6d16426655ab1711a3039f to your computer and use it in GitHub Desktop.
Simple OpenCL example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>
#include <string>
#include <vector>

#ifdef __APPLE__
#include <OpenCL/cl.hpp>
#else
#include <CL/cl.hpp>
#endif
#define NUM_GLOBAL_WITEMS 1024 | |
// Prints a human-readable comparison of one GPU timing against the CPU
// baseline: a header with the version number, both durations, and then the
// factor by which the GPU run was faster or slower.
//   CPUtime - duration of the CPU run
//   GPUtime - duration of the GPU run
//   trial   - version number shown in the header line
void compareResults(double CPUtime, double GPUtime, int trial) {
    const double speedup = CPUtime / GPUtime;

    std::cout << "VERSION " << trial << " -----------" << std::endl
              << "CPU time: " << CPUtime << std::endl
              << "GPU time: " << GPUtime << std::endl
              << "GPU is ";

    // Anything that is not strictly greater than 1 (including a tie) is
    // reported through the "slower" message, using the inverse ratio.
    if (!(speedup > 1)) {
        std::cout << (1 / speedup) << " times slower :(" << std::endl;
        return;
    }
    std::cout << speedup << " times faster!" << std::endl;
}
// CPU baseline for the benchmark.
// Fills two n-element vectors, then computes the element-wise sum
// C = A + B a total of k times. Returns the time spent in the summation
// loops only (vector setup is excluded), in seconds as reported by
// std::clock().
double timeAddVectorsCPU(int n, int k) {
    std::vector<int> lhs(n);
    std::vector<int> rhs(n);
    std::vector<int> sum(n);
    for (int idx = 0; idx < n; ++idx) {
        lhs[idx] = idx;
        rhs[idx] = n - idx;
        sum[idx] = 0;
    }

    const std::clock_t began = std::clock();
    for (int pass = 0; pass < k; ++pass) {
        for (int idx = 0; idx < n; ++idx) {
            sum[idx] = lhs[idx] + rhs[idx];
        }
    }
    const std::clock_t ended = std::clock();

    // Convert clock ticks to seconds.
    return (ended - began) / (double)CLOCKS_PER_SEC;
}
// Untimed warm-up pass: uploads A and B to fresh device buffers, launches the
// given `add` kernel five times, reads the result back and waits for the
// queue to drain. The result is discarded.
//   context - OpenCL context the temporary buffers are created in
//   queue   - command queue all commands are submitted to
//   add     - kernel taking (input, input, output) buffers as args 0, 1, 2
//   A, B    - host arrays holding n ints each
//   n       - number of ints in A and B
void warmup(cl::Context& context, cl::CommandQueue& queue,
            cl::Kernel& add, int A[], int B[], int n) {
    const auto bytes = sizeof(int) * n;
    std::vector<int> result(n);

    // Device-side storage for the two inputs and the output.
    cl::Buffer device_a(context, CL_MEM_READ_WRITE, bytes);
    cl::Buffer device_b(context, CL_MEM_READ_WRITE, bytes);
    cl::Buffer device_c(context, CL_MEM_READ_WRITE, bytes);

    // Blocking uploads of both inputs.
    queue.enqueueWriteBuffer(device_a, CL_TRUE, 0, bytes, A);
    queue.enqueueWriteBuffer(device_b, CL_TRUE, 0, bytes, B);

    // Bind the kernel arguments.
    add.setArg(0, device_a);
    add.setArg(1, device_b);
    add.setArg(2, device_c);

    // Launch the kernel five times: NUM_GLOBAL_WITEMS work-items, 32 per group.
    int launches_left = 5;
    while (launches_left-- > 0) {
        queue.enqueueNDRangeKernel(add, cl::NullRange,
                                   cl::NDRange(NUM_GLOBAL_WITEMS),
                                   cl::NDRange(32));
    }

    // Blocking read-back, then wait for everything submitted to complete.
    queue.enqueueReadBuffer(device_c, CL_TRUE, 0, bytes, result.data());
    queue.finish();
}
int main(int argc, char* argv[]) | |
{ | |
bool verbose; | |
if (argc == 1 || std::strcmp(argv[1], "0") == 0) | |
verbose = true; | |
else | |
verbose = false; | |
verbose = 1; | |
const int n = 8 * 32 * 512; // size of vectors | |
const int k = 1000; // number of loop iterations | |
// const int NUM_GLOBAL_WITEMS = 1024; // number of threads | |
// get all platforms (drivers), e.g. NVIDIA | |
std::vector<cl::Platform> all_platforms; | |
cl::Platform::get(&all_platforms); | |
if (all_platforms.size() == 0) { | |
std::cout << " No platforms found. Check OpenCL installation!\n"; | |
exit(1); | |
} | |
cl::Platform default_platform = all_platforms[1]; | |
std::cout << "Using platform: "<<default_platform.getInfo<CL_PLATFORM_NAME>()<<"\n"; | |
// get default device (CPUs, GPUs) of the default platform | |
std::vector<cl::Device> all_devices; | |
default_platform.getDevices(CL_DEVICE_TYPE_GPU, &all_devices); | |
if (all_devices.size() == 0) { | |
std::cout << " No devices found. Check OpenCL installation!\n"; | |
exit(1); | |
} | |
cl::Device default_device = all_devices[0]; | |
std::cout<< "Using device: "<<default_device.getInfo<CL_DEVICE_NAME>()<<"\n"; | |
cl::Context context({ default_device }); | |
cl::Program::Sources sources; | |
// calculates for each element; C = A + B | |
std::string kernel_code = | |
" void kernel add(global const int* v1, global const int* v2, global int* v3) {" | |
" int ID;" | |
" ID = get_global_id(0);" | |
" v3[ID] = v1[ID] + v2[ID];" | |
" }" | |
"" | |
" void kernel add_looped_1(global const int* v1, global const int* v2, global int* v3, " | |
" const int n, const int k) {" | |
" int ID, NUM_GLOBAL_WITEMS, ratio, start, stop;" | |
" ID = get_global_id(0);" | |
" NUM_GLOBAL_WITEMS = get_global_size(0);" | |
"" | |
" ratio = (n / NUM_GLOBAL_WITEMS);" // elements per thread | |
" start = ratio * ID;" | |
" stop = ratio * (ID+1);" | |
"" | |
" int i, j;" // will the compiler optimize this anyway? probably. | |
" for (i=0; i<k; i++) {" | |
" for (j=start; j<stop; j++)" | |
" v3[j] = v1[j] + v2[j];" | |
" }" | |
" }" | |
"" | |
" void kernel add_looped_2(global const int* v1, global const int* v2, global int* v3," | |
" const int n, const int k) {" | |
" int ID, NUM_GLOBAL_WITEMS, step;" | |
" ID = get_global_id(0);" | |
" NUM_GLOBAL_WITEMS = get_global_size(0);" | |
" step = (n / NUM_GLOBAL_WITEMS);" | |
"" | |
" int i,j;" | |
" for (i=0; i<k; i++) {" | |
" for (j=ID; j<n; j+=step)" | |
" v3[j] = v1[j] + v2[j];" | |
" }" | |
" }" | |
"" | |
" void kernel add_single(global const int* v1, global const int* v2, global int* v3, " | |
" const int k) { " | |
" int ID = get_global_id(0);" | |
" for (int i=0; i<k; i++)" | |
" v3[ID] = v1[ID] + v2[ID];" | |
" }"; | |
sources.push_back({ kernel_code.c_str(), kernel_code.length() }); | |
cl::Program program(context, sources); | |
if (program.build({ default_device }) != CL_SUCCESS) { | |
std::cout << "Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device) << std::endl; | |
exit(1); | |
} | |
// run the CPU code | |
float CPUtime = timeAddVectorsCPU(n, k); | |
// set up kernels and vectors for GPU code | |
cl::CommandQueue queue(context, default_device); | |
cl::Kernel add = cl::Kernel(program, "add"); | |
cl::Kernel add_looped_1 = cl::Kernel(program, "add_looped_1"); | |
cl::Kernel add_looped_2 = cl::Kernel(program, "add_looped_2"); | |
cl::Kernel add_single = cl::Kernel(program, "add_single"); | |
// construct vectors | |
std::vector<int> A(n); | |
std::vector<int> B(n); | |
std::vector<int> C(n); | |
for (int i = 0; i < n; i++) { | |
A[i] = i; | |
B[i] = n - i - 1; | |
} | |
// attempt at warm-up... | |
warmup(context, queue, add, A.data(), B.data(), n); | |
queue.finish(); | |
std::clock_t start_time; | |
// VERSION 1 ========================================== | |
// start timer | |
double GPUtime1; | |
start_time = std::clock(); | |
// allocate space | |
cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(int) * n); | |
cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, sizeof(int) * n); | |
cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, sizeof(int) * n); | |
// push write commands to queue | |
queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * n, A.data()); | |
queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * n, B.data()); | |
// RUN ZE KERNEL | |
add_looped_1.setArg(0, buffer_A); | |
add_looped_1.setArg(1, buffer_B); | |
add_looped_1.setArg(2, buffer_C); | |
add_looped_1.setArg(3, n); | |
add_looped_1.setArg(4, k); | |
queue.enqueueNDRangeKernel(add_looped_1, cl::NullRange, // kernel, offset | |
cl::NDRange(NUM_GLOBAL_WITEMS), // global number of work items | |
cl::NDRange(32)); // local number (per group) | |
// read result from GPU to here; including for the sake of timing | |
queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(int) * n, C.data()); | |
queue.finish(); | |
GPUtime1 = (std::clock() - start_time) / (double)CLOCKS_PER_SEC; | |
// VERSION 2 ========================================== | |
double GPUtime2; | |
cl::Buffer buffer_A2(context, CL_MEM_READ_WRITE, sizeof(int) * n); | |
cl::Buffer buffer_B2(context, CL_MEM_READ_WRITE, sizeof(int) * n); | |
cl::Buffer buffer_C2(context, CL_MEM_READ_WRITE, sizeof(int) * n); | |
queue.enqueueWriteBuffer(buffer_A2, CL_TRUE, 0, sizeof(int) * n, A.data()); | |
queue.enqueueWriteBuffer(buffer_B2, CL_TRUE, 0, sizeof(int) * n, B.data()); | |
start_time = std::clock(); | |
add_looped_2.setArg(0, buffer_A2); | |
add_looped_2.setArg(1, buffer_B2); | |
add_looped_2.setArg(2, buffer_C2); | |
add_looped_2.setArg(3, n); | |
add_looped_2.setArg(4, k); | |
queue.enqueueNDRangeKernel(add_looped_2, cl::NullRange, cl::NDRange(NUM_GLOBAL_WITEMS), cl::NDRange(32)); | |
queue.enqueueReadBuffer(buffer_C2, CL_TRUE, 0, sizeof(int) * n, C.data()); | |
queue.finish(); | |
GPUtime2 = (std::clock() - start_time) / (double)CLOCKS_PER_SEC; | |
// let's compare! | |
const int NUM_VERSIONS = 2; | |
double GPUtimes[NUM_VERSIONS] = { GPUtime1, GPUtime2 }; | |
if (verbose) { | |
for (int i = 0; i < NUM_VERSIONS; i++) | |
compareResults(CPUtime, GPUtimes[i], i + 1); | |
} | |
else { | |
std::cout << CPUtime << ","; | |
for (int i = 0; i < NUM_VERSIONS - 1; i++) | |
std::cout << GPUtimes[i] << ","; | |
std::cout << GPUtimes[NUM_VERSIONS - 1] << std::endl; | |
} | |
return 0; | |
} |
This looks like nice work, well done.
I know that I am nobody and you can completely skip the rest of this email,
but my intentions are not bad. These are only habits I try to follow in my
professional projects.
- When I opened some classes, I noticed that the style was not the same
everywhere,
some tools (`clang_format`) could help you to enforce the same format style
as you want (for example in a merge request).
- `virtual` and `override` are redundant; only the second one really
matters. If a function overrides, it must be a virtual function ...
…On Tue, 20 Oct 2020 at 20:59, Thales Sabino ***@***.***> wrote:
***@***.**** commented on this gist.
------------------------------
I started to read your code because of your Timer class and you don't even
use it here ;-)
https://gist.github.com/mcleary/b0bf4fa88830ff7c882d
That is true, however, this OpenCL sample was meant to be used as single
file to test OpenCL compilation and stuff like that. I did the Timer class
as a gist for quick reference, but I do use a similar version in my
Atmosphere demo here:
https://github.com/mcleary/pbr/blob/master/pbr/main.cpp
—
You are receiving this because you commented.
Reply to this email directly, view it on GitHub
<https://gist.github.com/2ecdf5edbe6d16426655ab1711a3039f#gistcomment-3496835>,
or unsubscribe
<https://github.com/notifications/unsubscribe-auth/ACA35DPZ7ZBZZN5YWPZ4IMDSLXM2TANCNFSM4SYXDAVQ>
.
Don't worry, all feedback is welcomed.
- When I opened some classes, I noticed that the style was not the same
everywhere,
I know that and I also use clang_format extensively but I didn't bother to use it in my personal project
`virtual` and `override` are redundant, only the second one really
matters. If it overrides, it must be a virtual function
I know that as well but I probably didn't at the time I wrote the code for the first time
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
That is true, however, this OpenCL sample was meant to be used as single file to test OpenCL compilation and stuff like that. I did the Timer class as a gist for quick reference, but I do use a similar version in my Atmosphere demo here: https://github.com/mcleary/pbr/blob/master/pbr/main.cpp
Another thing, I wrote that Timer class 4 years ago. There was a lot I didn't know at the time about how to use `std::chrono`.