Last active
August 29, 2015 13:57
-
-
Save nattoheaven/9369884 to your computer and use it in GitHub Desktop.
Test for HSA Example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <string>
#include <vector>
#define _mm_malloc(a, b) _aligned_malloc(a, b)
#include <CL/cl.h>
// OpenCL C source of the "triad" kernel: each work-item i performs
// c[i] += a[i] * b[i].  The text is handed to the OpenCL runtime compiler as
// is, so every byte (including the trailing newline) is part of the program.
const char triad_kernel[] =
R"CLC(__kernel void
triad(__global float *c,
__global const float *a,
__global const float *b)
{
size_t i;
i = get_global_id(0);
c[i] += a[i] * b[i];
}
)CLC";
int | |
main() | |
{ | |
cl_int clerr; | |
cl_uint nplatforms; | |
clerr = clGetPlatformIDs(0, 0, &nplatforms); | |
cl_platform_id *platforms = new cl_platform_id[nplatforms]; | |
clerr = clGetPlatformIDs(nplatforms, platforms, 0); | |
for (cl_uint i = 0; i < nplatforms; ++i) { | |
std::cout << "PLATFORM:\t" << i << std::endl; | |
cl_platform_info param_names[] = { | |
CL_PLATFORM_PROFILE, | |
CL_PLATFORM_VERSION, | |
CL_PLATFORM_NAME, | |
CL_PLATFORM_VENDOR, | |
CL_PLATFORM_EXTENSIONS, | |
}; | |
const size_t nparam_names = sizeof(param_names) / sizeof(cl_platform_info); | |
for (size_t j = 0; j < nparam_names; ++j) { | |
size_t param_value_size; | |
clerr = clGetPlatformInfo(platforms[i], | |
param_names[j], | |
0, | |
0, | |
¶m_value_size); | |
char *param_value = new char[param_value_size]; | |
clerr = clGetPlatformInfo(platforms[i], | |
param_names[j], | |
param_value_size, | |
param_value, | |
0); | |
std::cout << "\t" << param_value << std::endl; | |
delete[] param_value; | |
} | |
cl_uint ndevices; | |
clerr = clGetDeviceIDs(platforms[i], | |
CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, | |
0, | |
0, | |
&ndevices); | |
cl_device_id *devices = new cl_device_id[ndevices]; | |
clerr = clGetDeviceIDs(platforms[i], | |
CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, | |
ndevices, | |
devices, | |
0); | |
for (cl_uint j = 0; j < ndevices; ++j) { | |
cl_device_type type; | |
clerr = clGetDeviceInfo(devices[j], | |
CL_DEVICE_TYPE, | |
sizeof(type), | |
&type, | |
0); | |
std::cout << "\tDEVICE:\t" << j << ":\t"; | |
switch (type) { | |
case CL_DEVICE_TYPE_CPU: | |
std::cout << "CPU" << std::endl; | |
break; | |
case CL_DEVICE_TYPE_GPU: | |
std::cout << "GPU" << std::endl; | |
break; | |
default: | |
std::cout << "UNKNOWN" << std::endl; | |
break; | |
} | |
} | |
cl_context_properties context_properties[] = { | |
CL_CONTEXT_PLATFORM, | |
(cl_context_properties) platforms[i], | |
0 | |
}; | |
cl_context context = clCreateContext(context_properties, | |
ndevices, | |
devices, | |
0, | |
0, | |
&clerr); | |
const char *program_sources[] = { triad_kernel }; | |
const size_t program_lengths[] = { sizeof(triad_kernel) }; | |
cl_program program = clCreateProgramWithSource(context, | |
1, | |
program_sources, | |
program_lengths, | |
&clerr); | |
clerr = clBuildProgram(program, | |
ndevices, | |
devices, | |
0, | |
0, | |
0); | |
cl_command_queue *queues = new cl_command_queue[ndevices]; | |
for (cl_uint j = 0; j < ndevices; ++j) { | |
size_t param_value_size; | |
clerr = clGetProgramBuildInfo(program, | |
devices[j], | |
CL_PROGRAM_BUILD_LOG, | |
0, | |
0, | |
¶m_value_size); | |
char *param_value = new char[param_value_size]; | |
clerr = clGetProgramBuildInfo(program, | |
devices[j], | |
CL_PROGRAM_BUILD_LOG, | |
param_value_size, | |
param_value, | |
0); | |
std::cout << param_value << std::endl; | |
delete[] param_value; | |
queues[j] = clCreateCommandQueue(context, | |
devices[j], | |
0, | |
&clerr); | |
} | |
const size_t n = 0x04000000; | |
const size_t alignment = 4096; | |
float *a = | |
reinterpret_cast<float *>(_mm_malloc(n * sizeof(float), alignment)); | |
float *b = | |
reinterpret_cast<float *>(_mm_malloc(n * sizeof(float), alignment)); | |
float *c = | |
reinterpret_cast<float *>(_mm_malloc(n * sizeof(float), alignment)); | |
for (ptrdiff_t j = 0; j < n; ++j) { | |
a[j] = 1.0f; | |
b[j] = 2.0f; | |
c[j] = 0.0f; | |
} | |
cl_mem cl_a = clCreateBuffer(context, | |
CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, | |
n * sizeof(float), | |
a, | |
&clerr); | |
cl_mem cl_b = clCreateBuffer(context, | |
CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, | |
n * sizeof(float), | |
b, | |
&clerr); | |
cl_mem cl_c = clCreateBuffer(context, | |
CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, | |
n * sizeof(float), | |
c, | |
&clerr); | |
cl_kernel kernel = clCreateKernel(program, | |
"triad", | |
&clerr); | |
clerr = clSetKernelArg(kernel, | |
0, | |
sizeof(cl_c), | |
&cl_c); | |
clerr = clSetKernelArg(kernel, | |
1, | |
sizeof(cl_a), | |
&cl_a); | |
clerr = clSetKernelArg(kernel, | |
2, | |
sizeof(cl_b), | |
&cl_b); | |
const int niter = 300; | |
unsigned long long *times = new unsigned long long[ndevices + 1]; | |
for (cl_uint j = 0; j < ndevices + 1; ++j) { | |
times[j] = 0; | |
} | |
time_t time0; | |
time0 = time(0); | |
for (int iter = 0; iter < niter; ++iter) { | |
for (cl_uint j = 0; j < ndevices; ++j) { | |
time_t time1 = time0; | |
cl_event event; | |
clerr = clEnqueueNDRangeKernel(queues[j], | |
kernel, | |
1, | |
0, | |
&n, | |
0, | |
0, | |
0, | |
&event); | |
clerr = clWaitForEvents(1, | |
&event); | |
time0 = time(0); | |
times[j] += time0 - time1; | |
} | |
{ | |
time_t time1 = time0; | |
#pragma omp parallel for | |
for (ptrdiff_t j = 0; j < n; ++j) { | |
c[j] += a[j] * b[j]; | |
} | |
time0 = time(0); | |
times[ndevices] += time0 - time1; | |
} | |
} | |
std::cout << "Inter-Device Accesses" << std::endl; | |
for (cl_uint j = 0; j < ndevices + 1; ++j) { | |
std::cout << times[j] << "\tseconds" << std::endl; | |
double gflops = 3.0e-9 * n * niter / times[j]; | |
std::cout << gflops << "\tGFLOPS" << std::endl; | |
double gbs = 3.0e-9 * n * niter * sizeof(float) / times[j]; | |
std::cout << gbs << "\tGB/s" << std::endl; | |
} | |
for (cl_uint j = 0; j < ndevices + 1; ++j) { | |
times[j] = 0; | |
} | |
time0 = time(0); | |
for (cl_uint j = 0; j < ndevices; ++j) { | |
for (int iter = 0; iter < niter; ++iter) { | |
time_t time1 = time0; | |
cl_event event; | |
clerr = clEnqueueNDRangeKernel(queues[j], | |
kernel, | |
1, | |
0, | |
&n, | |
0, | |
0, | |
0, | |
&event); | |
clerr = clWaitForEvents(1, | |
&event); | |
time0 = time(0); | |
times[j] += time0 - time1; | |
} | |
} | |
{ | |
for (int iter = 0; iter < niter; ++iter) { | |
time_t time1 = time0; | |
#pragma omp parallel for | |
for (ptrdiff_t j = 0; j < n; ++j) { | |
c[j] += a[j] * b[j]; | |
} | |
time0 = time(0); | |
times[ndevices] += time0 - time1; | |
} | |
} | |
std::cout << "Intra-Device Accesses" << std::endl; | |
for (cl_uint j = 0; j < ndevices + 1; ++j) { | |
std::cout << times[j] << "\tseconds" << std::endl; | |
double gflops = 3.0e-9 * n * niter / times[j]; | |
std::cout << gflops << "\tGFLOPS" << std::endl; | |
double gbs = 3.0e-9 * n * niter * sizeof(float) / times[j]; | |
std::cout << gbs << "\tGB/s" << std::endl; | |
} | |
double sum = 0.0; | |
for (ptrdiff_t j = 0; j < n; ++j) { | |
sum += c[j]; | |
} | |
std::cout << sum << std::endl; | |
std::cout << 2.0 * n * niter * (ndevices + 1) * 2 << std::endl; | |
} | |
std::string wait; | |
std::cin >> wait; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment