#include <iostream>
#include <vector>
#include <cstring>
#include <ctime>

#ifdef __APPLE__
#include <OpenCL/cl.hpp>
#else
#include <CL/cl.hpp>
#endif

#define NUM_GLOBAL_WITEMS 1024
void compareResults(double CPUtime, double GPUtime, int trial) {
    double time_ratio = (CPUtime / GPUtime);
    std::cout << "VERSION " << trial << " -----------" << std::endl;
    std::cout << "CPU time: " << CPUtime << std::endl;
    std::cout << "GPU time: " << GPUtime << std::endl;
    std::cout << "GPU is ";
    if (time_ratio > 1)
        std::cout << time_ratio << " times faster!" << std::endl;
    else
        std::cout << (1 / time_ratio) << " times slower :(" << std::endl;
}
double timeAddVectorsCPU(int n, int k) {
    // adds two vectors of size n, k times, returns total duration
    std::clock_t start;
    double duration;

    std::vector<int> A(n);
    std::vector<int> B(n);
    std::vector<int> C(n);
    for (int i = 0; i < n; i++) {
        A[i] = i;
        B[i] = n - i;
        C[i] = 0;
    }

    start = std::clock();
    for (int i = 0; i < k; i++) {
        for (int j = 0; j < n; j++)
            C[j] = A[j] + B[j];
    }
    duration = (std::clock() - start) / (double)CLOCKS_PER_SEC;
    return duration;
}
void warmup(cl::Context& context, cl::CommandQueue& queue,
            cl::Kernel& add, int A[], int B[], int n) {
    std::vector<int> C(n);

    // allocate space
    cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(int) * n);
    cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, sizeof(int) * n);
    cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, sizeof(int) * n);

    // push write commands to queue
    queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * n, A);
    queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * n, B);

    // RUN ZE KERNEL a few times to warm the device up
    add.setArg(0, buffer_A);
    add.setArg(1, buffer_B);
    add.setArg(2, buffer_C);
    for (int i = 0; i < 5; i++)
        queue.enqueueNDRangeKernel(add, cl::NullRange, cl::NDRange(NUM_GLOBAL_WITEMS), cl::NDRange(32));

    queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(int) * n, C.data());
    queue.finish();
}
int main(int argc, char* argv[])
{
    // verbose output unless the first argument is something other than "0"
    bool verbose;
    if (argc == 1 || std::strcmp(argv[1], "0") == 0)
        verbose = true;
    else
        verbose = false;
    const int n = 8 * 32 * 512;            // size of vectors
    const int k = 1000;                    // number of loop iterations
    // const int NUM_GLOBAL_WITEMS = 1024; // number of threads

    // get all platforms (drivers), e.g. NVIDIA
    std::vector<cl::Platform> all_platforms;
    cl::Platform::get(&all_platforms);
    if (all_platforms.size() == 0) {
        std::cout << " No platforms found. Check OpenCL installation!\n";
        exit(1);
    }
    cl::Platform default_platform = all_platforms[0];
    std::cout << "Using platform: " << default_platform.getInfo<CL_PLATFORM_NAME>() << "\n";

    // get default device (CPUs, GPUs) of the default platform
    std::vector<cl::Device> all_devices;
    default_platform.getDevices(CL_DEVICE_TYPE_GPU, &all_devices);
    if (all_devices.size() == 0) {
        std::cout << " No devices found. Check OpenCL installation!\n";
        exit(1);
    }
    cl::Device default_device = all_devices[0];
    std::cout << "Using device: " << default_device.getInfo<CL_DEVICE_NAME>() << "\n";

    cl::Context context({ default_device });
    cl::Program::Sources sources;

    // calculates for each element: C = A + B
    std::string kernel_code =
        "   void kernel add(global const int* v1, global const int* v2, global int* v3) {"
        "       int ID;"
        "       ID = get_global_id(0);"
        "       v3[ID] = v1[ID] + v2[ID];"
        "   }"
        ""
        "   void kernel add_looped_1(global const int* v1, global const int* v2, global int* v3, "
        "                            const int n, const int k) {"
        "       int ID, NUM_GLOBAL_WITEMS, ratio, start, stop;"
        "       ID = get_global_id(0);"
        "       NUM_GLOBAL_WITEMS = get_global_size(0);"
        ""
        "       ratio = (n / NUM_GLOBAL_WITEMS);" // elements per thread
        "       start = ratio * ID;"
        "       stop  = ratio * (ID+1);"
        ""
        "       int i, j;" // will the compiler optimize this anyway? probably.
        "       for (i=0; i<k; i++) {"
        "           for (j=start; j<stop; j++)"
        "               v3[j] = v1[j] + v2[j];"
        "       }"
        "   }"
        ""
        "   void kernel add_looped_2(global const int* v1, global const int* v2, global int* v3,"
        "                            const int n, const int k) {"
        "       int ID, NUM_GLOBAL_WITEMS, step;"
        "       ID = get_global_id(0);"
        "       NUM_GLOBAL_WITEMS = get_global_size(0);"
        "       step = (n / NUM_GLOBAL_WITEMS);"
        ""
        "       int i,j;"
        "       for (i=0; i<k; i++) {"
        "           for (j=ID; j<n; j+=step)"
        "               v3[j] = v1[j] + v2[j];"
        "       }"
        "   }"
        ""
        "   void kernel add_single(global const int* v1, global const int* v2, global int* v3, "
        "                          const int k) { "
        "       int ID = get_global_id(0);"
        "       for (int i=0; i<k; i++)"
        "           v3[ID] = v1[ID] + v2[ID];"
        "   }";
    sources.push_back({ kernel_code.c_str(), kernel_code.length() });

    cl::Program program(context, sources);
    if (program.build({ default_device }) != CL_SUCCESS) {
        std::cout << "Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device) << std::endl;
        exit(1);
    }
    // run the CPU code
    double CPUtime = timeAddVectorsCPU(n, k);

    // set up kernels and vectors for GPU code
    cl::CommandQueue queue(context, default_device);
    cl::Kernel add = cl::Kernel(program, "add");
    cl::Kernel add_looped_1 = cl::Kernel(program, "add_looped_1");
    cl::Kernel add_looped_2 = cl::Kernel(program, "add_looped_2");
    cl::Kernel add_single = cl::Kernel(program, "add_single");

    // construct vectors
    std::vector<int> A(n);
    std::vector<int> B(n);
    std::vector<int> C(n);
    for (int i = 0; i < n; i++) {
        A[i] = i;
        B[i] = n - i - 1;
    }

    // attempt at warm-up...
    warmup(context, queue, add, A.data(), B.data(), n);
    queue.finish();
    std::clock_t start_time;
    // VERSION 1 ==========================================
    // start timer
    double GPUtime1;
    start_time = std::clock();

    // allocate space
    cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(int) * n);
    cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, sizeof(int) * n);
    cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, sizeof(int) * n);

    // push write commands to queue
    queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * n, A.data());
    queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * n, B.data());

    // RUN ZE KERNEL
    add_looped_1.setArg(0, buffer_A);
    add_looped_1.setArg(1, buffer_B);
    add_looped_1.setArg(2, buffer_C);
    add_looped_1.setArg(3, n);
    add_looped_1.setArg(4, k);
    queue.enqueueNDRangeKernel(add_looped_1, cl::NullRange,     // kernel, offset
                               cl::NDRange(NUM_GLOBAL_WITEMS), // global number of work items
                               cl::NDRange(32));               // local number (per group)

    // read result from GPU to here; included for the sake of timing
    queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(int) * n, C.data());
    queue.finish();
    GPUtime1 = (std::clock() - start_time) / (double)CLOCKS_PER_SEC;

    // VERSION 2 ==========================================
    double GPUtime2;
    cl::Buffer buffer_A2(context, CL_MEM_READ_WRITE, sizeof(int) * n);
    cl::Buffer buffer_B2(context, CL_MEM_READ_WRITE, sizeof(int) * n);
    cl::Buffer buffer_C2(context, CL_MEM_READ_WRITE, sizeof(int) * n);
    queue.enqueueWriteBuffer(buffer_A2, CL_TRUE, 0, sizeof(int) * n, A.data());
    queue.enqueueWriteBuffer(buffer_B2, CL_TRUE, 0, sizeof(int) * n, B.data());

    start_time = std::clock();
    add_looped_2.setArg(0, buffer_A2);
    add_looped_2.setArg(1, buffer_B2);
    add_looped_2.setArg(2, buffer_C2);
    add_looped_2.setArg(3, n);
    add_looped_2.setArg(4, k);
    queue.enqueueNDRangeKernel(add_looped_2, cl::NullRange, cl::NDRange(NUM_GLOBAL_WITEMS), cl::NDRange(32));
    queue.enqueueReadBuffer(buffer_C2, CL_TRUE, 0, sizeof(int) * n, C.data());
    queue.finish();
    GPUtime2 = (std::clock() - start_time) / (double)CLOCKS_PER_SEC;

    // let's compare!
    const int NUM_VERSIONS = 2;
    double GPUtimes[NUM_VERSIONS] = { GPUtime1, GPUtime2 };
    if (verbose) {
        for (int i = 0; i < NUM_VERSIONS; i++)
            compareResults(CPUtime, GPUtimes[i], i + 1);
    }
    else {
        std::cout << CPUtime << ",";
        for (int i = 0; i < NUM_VERSIONS - 1; i++)
            std::cout << GPUtimes[i] << ",";
        std::cout << GPUtimes[NUM_VERSIONS - 1] << std::endl;
    }
    return 0;
}
I started to read your code because of your Timer class and you don't even use it here ;-)
https://gist.github.com/mcleary/b0bf4fa88830ff7c882d
That is true; however, this OpenCL sample was meant to be used as a single file to test OpenCL compilation and things like that. I did the Timer class as a gist for quick reference, but I do use a similar version in my Atmosphere demo here: https://github.com/mcleary/pbr/blob/master/pbr/main.cpp
Another thing: I wrote that Timer class 4 years ago. There was a lot I didn't know at the time about how to use std::chrono.
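For reference, a minimal wall-clock timer built on std::chrono::steady_clock could look like the sketch below. The names (ChronoTimer, elapsedSeconds) are made up for illustration; this is not the Timer class from the gist. Something along these lines would also suit the benchmark above better than std::clock(), since std::clock() measures processor time rather than elapsed wall time.

// Sketch only: minimal wall-clock timer using std::chrono::steady_clock.
// ChronoTimer / elapsedSeconds are illustrative names, not from the gist.
#include <chrono>

class ChronoTimer {
public:
    ChronoTimer() : m_start(std::chrono::steady_clock::now()) {}

    void reset() { m_start = std::chrono::steady_clock::now(); }

    double elapsedSeconds() const {
        const auto now = std::chrono::steady_clock::now();
        return std::chrono::duration<double>(now - m_start).count();
    }

private:
    std::chrono::steady_clock::time_point m_start;
};

// Possible usage around one of the GPU versions above:
//   ChronoTimer timer;
//   ... enqueue kernel, read buffer, queue.finish() ...
//   double GPUtime1 = timer.elapsedSeconds();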
Don't worry, all feedback is welcomed.
- When I opened some classes, I noticed that the style was not the same everywhere.
I know that, and I also use clang-format extensively, but I didn't bother to use it in this personal project.
- virtual and override are redundant; only the second one really matters. If it overrides, it must be a virtual function.
I know that as well, but I probably didn't at the time I first wrote the code.
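For illustration, a small made-up example of that point (not taken from the project's code):

struct Base {
    virtual void update() {}
    virtual ~Base() = default;
};

struct Derived : Base {
    // Writing "virtual" here again would be redundant: "override" already
    // implies the function is virtual, and it additionally makes the
    // compiler verify that a matching virtual Base::update exists.
    void update() override {}
};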