nattoheaven · August 29, 2015 13:57
diff --git a/gistfile1.cpp b/gistfile1.cpp
 #include <iostream>
 #include <string>
 #include <ctime>

 #define _mm_malloc(a, b) _aligned_malloc(a, b)

 #include <CL/cl.h>

 // OpenCL C source of the "triad" kernel, compiled at run time by main().
 // Each work-item takes its global id i along dimension 0 and performs
 // c[i] += a[i] * b[i]. The text is kept byte-for-byte stable because main()
 // derives the program length from sizeof(triad_kernel).
 const char triad_kernel[] =
     // Signature: c is the read/write accumulator, a and b are read-only.
     "__kernel void\ntriad(__global float *c,\n"
     "__global const float *a,\n__global const float *b)\n"
     // Body: one multiply-accumulate per work-item.
     "{\nsize_t i;\n"
     "i = get_global_id(0);\n"
     "c[i] += a[i] * b[i];\n}\n";

 int
 main()
 {
  cl_int clerr;
  cl_uint nplatforms;
  clerr = clGetPlatformIDs(0, 0, &nplatforms);
  cl_platform_id *platforms = new cl_platform_id[nplatforms];
  clerr = clGetPlatformIDs(nplatforms, platforms, 0);
  for (cl_uint i = 0; i < nplatforms; ++i) {
    std::cout << "PLATFORM:\t" << i << std::endl;
    cl_platform_info param_names[] = {
      CL_PLATFORM_PROFILE,
      CL_PLATFORM_VERSION,
      CL_PLATFORM_NAME,
      CL_PLATFORM_VENDOR,
      CL_PLATFORM_EXTENSIONS,
    };
    const size_t nparam_names = sizeof(param_names) / sizeof(cl_platform_info);
    for (size_t j = 0; j < nparam_names; ++j) {
      size_t param_value_size;
      clerr = clGetPlatformInfo(platforms[i],
                                param_names[j],
                                0,
                                0,
                                &param_value_size);
      char *param_value = new char[param_value_size];
      clerr = clGetPlatformInfo(platforms[i],
                                param_names[j],
                                param_value_size,
                                param_value,
                                0);
      std::cout << "\t" << param_value << std::endl;
      delete[] param_value;
    }

    cl_uint ndevices;
    clerr = clGetDeviceIDs(platforms[i],
                           CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU,
                           0,
                           0,
                           &ndevices);
    cl_device_id *devices = new cl_device_id[ndevices];
    clerr = clGetDeviceIDs(platforms[i],
                           CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU,
                           ndevices,
                           devices,
                           0);
    for (cl_uint j = 0; j < ndevices; ++j) {
      cl_device_type type;
      clerr = clGetDeviceInfo(devices[j],
                              CL_DEVICE_TYPE,
                              sizeof(type),
                              &type,
                              0);
      std::cout << "\tDEVICE:\t" << j << ":\t";
      switch (type) {
      case CL_DEVICE_TYPE_CPU:
        std::cout << "CPU" << std::endl;
        break;
      case CL_DEVICE_TYPE_GPU:
        std::cout << "GPU" << std::endl;
        break;
      default:
        std::cout << "UNKNOWN" << std::endl;
        break;
      }
    }
    cl_context_properties context_properties[] = {
      CL_CONTEXT_PLATFORM,
      (cl_context_properties) platforms[i],
      0
    };
    cl_context context = clCreateContext(context_properties,
                                         ndevices,
                                         devices,
                                         0,
                                         0,
                                         &clerr);
    const char *program_sources[] = { triad_kernel };
    const size_t program_lengths[] = { sizeof(triad_kernel) };
    cl_program program = clCreateProgramWithSource(context,
                                                   1,
                                                   program_sources,
                                                   program_lengths,
                                                   &clerr);
    clerr = clBuildProgram(program,
                           ndevices,
                           devices,
                           0,
                           0,
                           0);
    cl_command_queue *queues = new cl_command_queue[ndevices];
    for (cl_uint j = 0; j < ndevices; ++j) {
      size_t param_value_size;
      clerr = clGetProgramBuildInfo(program,
                                    devices[j],
                                    CL_PROGRAM_BUILD_LOG,
                                    0,
                                    0,
                                    &param_value_size);
      char *param_value = new char[param_value_size];
      clerr = clGetProgramBuildInfo(program,
                                    devices[j],
                                    CL_PROGRAM_BUILD_LOG,
                                    param_value_size,
                                    param_value,
                                    0);
      std::cout << param_value << std::endl;
      delete[] param_value;
      queues[j] = clCreateCommandQueue(context,
                                       devices[j],
                                       0,
                                       &clerr);
    }
    const size_t n = 0x04000000;
    const size_t alignment = 4096;
    float *a =
      reinterpret_cast<float *>(_mm_malloc(n * sizeof(float), alignment));
    float *b =
      reinterpret_cast<float *>(_mm_malloc(n * sizeof(float), alignment));
    float *c =
      reinterpret_cast<float *>(_mm_malloc(n * sizeof(float), alignment));
    for (ptrdiff_t j = 0; j < n; ++j) {
      a[j] = 1.0f;
      b[j] = 2.0f;
      c[j] = 0.0f;
    }
    cl_mem cl_a = clCreateBuffer(context,
                                 CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                                 n * sizeof(float),
                                 a,
                                 &clerr);
    cl_mem cl_b = clCreateBuffer(context,
                                 CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                                 n * sizeof(float),
                                 b,
                                 &clerr);
    cl_mem cl_c = clCreateBuffer(context,
                                 CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                                 n * sizeof(float),
                                 c,
                                 &clerr);
    cl_kernel kernel = clCreateKernel(program,
                                      "triad",
                                      &clerr);
    clerr = clSetKernelArg(kernel,
                           0,
                           sizeof(cl_c),
                           &cl_c);
    clerr = clSetKernelArg(kernel,
                           1,
                           sizeof(cl_a),
                           &cl_a);
    clerr = clSetKernelArg(kernel,
                           2,
                           sizeof(cl_b),
                           &cl_b);
    const int niter = 300;
    unsigned long long *times = new unsigned long long[ndevices + 1];
    for (cl_uint j = 0; j < ndevices + 1; ++j) {
      times[j] = 0;
    }
    time_t time0;
    time0 = time(0);
    for (int iter = 0; iter < niter; ++iter) {
      for (cl_uint j = 0; j < ndevices; ++j) {
        time_t time1 = time0;
        cl_event event;
        clerr = clEnqueueNDRangeKernel(queues[j],
                                       kernel,
                                       1,
                                       0,
                                       &n,
                                       0,
                                       0,
                                       0,
                                       &event);
        clerr = clWaitForEvents(1,
 				&event);
        time0 = time(0);
        times[j] += time0 - time1;
      }
      {
        time_t time1 = time0;
 #pragma omp parallel for
        for (ptrdiff_t j = 0; j < n; ++j) {
          c[j] += a[j] * b[j];
        }
        time0 = time(0);
        times[ndevices] += time0 - time1;
      }
    }
    std::cout << "Inter-Device Accesses" << std::endl;
    for (cl_uint j = 0; j < ndevices + 1; ++j) {
      std::cout << times[j] << "\tseconds" << std::endl;
      double gflops = 3.0e-9 * n * niter / times[j];
      std::cout << gflops << "\tGFLOPS" << std::endl;
      double gbs = 3.0e-9 * n * niter * sizeof(float) / times[j];
      std::cout << gbs << "\tGB/s" << std::endl;
    }
    for (cl_uint j = 0; j < ndevices + 1; ++j) {
      times[j] = 0;
    }
    time0 = time(0);
    for (cl_uint j = 0; j < ndevices; ++j) {
      for (int iter = 0; iter < niter; ++iter) {
        time_t time1 = time0;
        cl_event event;
        clerr = clEnqueueNDRangeKernel(queues[j],
                                       kernel,
                                       1,
                                       0,
                                       &n,
                                       0,
                                       0,
                                       0,
                                       &event);
        clerr = clWaitForEvents(1,
 				&event);
        time0 = time(0);
        times[j] += time0 - time1;
      }
    }
    {
      for (int iter = 0; iter < niter; ++iter) {
        time_t time1 = time0;
 #pragma omp parallel for
        for (ptrdiff_t j = 0; j < n; ++j) {
          c[j] += a[j] * b[j];
        }
        time0 = time(0);
        times[ndevices] += time0 - time1;
      }
    }
    std::cout << "Intra-Device Accesses" << std::endl;
    for (cl_uint j = 0; j < ndevices + 1; ++j) {
      std::cout << times[j] << "\tseconds" << std::endl;
      double gflops = 3.0e-9 * n * niter / times[j];
      std::cout << gflops << "\tGFLOPS" << std::endl;
      double gbs = 3.0e-9 * n * niter * sizeof(float) / times[j];
      std::cout << gbs << "\tGB/s" << std::endl;
    }
    double sum = 0.0;
    for (ptrdiff_t j = 0; j < n; ++j) {
      sum += c[j];
    }
    std::cout << sum << std::endl;
    std::cout << 2.0 * n * niter * (ndevices + 1) * 2 << std::endl;
  }
  std::string wait;
  std::cin >> wait;
  return 0;
 }
	#include <iostream>
	#include <string>
	#include <ctime>

	#define _mm_malloc(a, b) _aligned_malloc(a, b)

	#include <CL/cl.h>

	// OpenCL C source of the "triad" kernel, compiled at run time by main().
	// Each work-item takes its global id i along dimension 0 and performs
	// c[i] += a[i] * b[i]. The text is kept byte-for-byte stable because main()
	// derives the program length from sizeof(triad_kernel).
	const char triad_kernel[] =
	    // Signature: c is the read/write accumulator, a and b are read-only.
	    "__kernel void\ntriad(__global float *c,\n"
	    "__global const float *a,\n__global const float *b)\n"
	    // Body: one multiply-accumulate per work-item.
	    "{\nsize_t i;\n"
	    "i = get_global_id(0);\n"
	    "c[i] += a[i] * b[i];\n}\n";

	int
	main()
	{
	cl_int clerr;
	cl_uint nplatforms;
	clerr = clGetPlatformIDs(0, 0, &nplatforms);
	cl_platform_id *platforms = new cl_platform_id[nplatforms];
	clerr = clGetPlatformIDs(nplatforms, platforms, 0);
	for (cl_uint i = 0; i < nplatforms; ++i) {
	std::cout << "PLATFORM:\t" << i << std::endl;
	cl_platform_info param_names[] = {
	CL_PLATFORM_PROFILE,
	CL_PLATFORM_VERSION,
	CL_PLATFORM_NAME,
	CL_PLATFORM_VENDOR,
	CL_PLATFORM_EXTENSIONS,
	};
	const size_t nparam_names = sizeof(param_names) / sizeof(cl_platform_info);
	for (size_t j = 0; j < nparam_names; ++j) {
	size_t param_value_size;
	clerr = clGetPlatformInfo(platforms[i],
	param_names[j],
	0,
	0,
	&param_value_size);
	char *param_value = new char[param_value_size];
	clerr = clGetPlatformInfo(platforms[i],
	param_names[j],
	param_value_size,
	param_value,
	0);
	std::cout << "\t" << param_value << std::endl;
	delete[] param_value;
	}

	cl_uint ndevices;
	clerr = clGetDeviceIDs(platforms[i],
	CL_DEVICE_TYPE_CPU \| CL_DEVICE_TYPE_GPU,
	0,
	0,
	&ndevices);
	cl_device_id *devices = new cl_device_id[ndevices];
	clerr = clGetDeviceIDs(platforms[i],
	CL_DEVICE_TYPE_CPU \| CL_DEVICE_TYPE_GPU,
	ndevices,
	devices,
	0);
	for (cl_uint j = 0; j < ndevices; ++j) {
	cl_device_type type;
	clerr = clGetDeviceInfo(devices[j],
	CL_DEVICE_TYPE,
	sizeof(type),
	&type,
	0);
	std::cout << "\tDEVICE:\t" << j << ":\t";
	switch (type) {
	case CL_DEVICE_TYPE_CPU:
	std::cout << "CPU" << std::endl;
	break;
	case CL_DEVICE_TYPE_GPU:
	std::cout << "GPU" << std::endl;
	break;
	default:
	std::cout << "UNKNOWN" << std::endl;
	break;
	}
	}
	cl_context_properties context_properties[] = {
	CL_CONTEXT_PLATFORM,
	(cl_context_properties) platforms[i],
	0
	};
	cl_context context = clCreateContext(context_properties,
	ndevices,
	devices,
	0,
	0,
	&clerr);
	const char *program_sources[] = { triad_kernel };
	const size_t program_lengths[] = { sizeof(triad_kernel) };
	cl_program program = clCreateProgramWithSource(context,
	1,
	program_sources,
	program_lengths,
	&clerr);
	clerr = clBuildProgram(program,
	ndevices,
	devices,
	0,
	0,
	0);
	cl_command_queue *queues = new cl_command_queue[ndevices];
	for (cl_uint j = 0; j < ndevices; ++j) {
	size_t param_value_size;
	clerr = clGetProgramBuildInfo(program,
	devices[j],
	CL_PROGRAM_BUILD_LOG,
	0,
	0,
	&param_value_size);
	char *param_value = new char[param_value_size];
	clerr = clGetProgramBuildInfo(program,
	devices[j],
	CL_PROGRAM_BUILD_LOG,
	param_value_size,
	param_value,
	0);
	std::cout << param_value << std::endl;
	delete[] param_value;
	queues[j] = clCreateCommandQueue(context,
	devices[j],
	0,
	&clerr);
	}
	const size_t n = 0x04000000;
	const size_t alignment = 4096;
	float *a =
	reinterpret_cast<float >(_mm_malloc(n sizeof(float), alignment));
	float *b =
	reinterpret_cast<float >(_mm_malloc(n sizeof(float), alignment));
	float *c =
	reinterpret_cast<float >(_mm_malloc(n sizeof(float), alignment));
	for (ptrdiff_t j = 0; j < n; ++j) {
	a[j] = 1.0f;
	b[j] = 2.0f;
	c[j] = 0.0f;
	}
	cl_mem cl_a = clCreateBuffer(context,
	CL_MEM_READ_WRITE \| CL_MEM_USE_HOST_PTR,
	n * sizeof(float),
	a,
	&clerr);
	cl_mem cl_b = clCreateBuffer(context,
	CL_MEM_READ_WRITE \| CL_MEM_USE_HOST_PTR,
	n * sizeof(float),
	b,
	&clerr);
	cl_mem cl_c = clCreateBuffer(context,
	CL_MEM_READ_WRITE \| CL_MEM_USE_HOST_PTR,
	n * sizeof(float),
	c,
	&clerr);
	cl_kernel kernel = clCreateKernel(program,
	"triad",
	&clerr);
	clerr = clSetKernelArg(kernel,
	0,
	sizeof(cl_c),
	&cl_c);
	clerr = clSetKernelArg(kernel,
	1,
	sizeof(cl_a),
	&cl_a);
	clerr = clSetKernelArg(kernel,
	2,
	sizeof(cl_b),
	&cl_b);
	const int niter = 300;
	unsigned long long *times = new unsigned long long[ndevices + 1];
	for (cl_uint j = 0; j < ndevices + 1; ++j) {
	times[j] = 0;
	}
	time_t time0;
	time0 = time(0);
	for (int iter = 0; iter < niter; ++iter) {
	for (cl_uint j = 0; j < ndevices; ++j) {
	time_t time1 = time0;
	cl_event event;
	clerr = clEnqueueNDRangeKernel(queues[j],
	kernel,
	1,
	0,
	&n,
	0,
	0,
	0,
	&event);
	clerr = clWaitForEvents(1,
	&event);
	time0 = time(0);
	times[j] += time0 - time1;
	}
	{
	time_t time1 = time0;
	#pragma omp parallel for
	for (ptrdiff_t j = 0; j < n; ++j) {
	c[j] += a[j] * b[j];
	}
	time0 = time(0);
	times[ndevices] += time0 - time1;
	}
	}
	std::cout << "Inter-Device Accesses" << std::endl;
	for (cl_uint j = 0; j < ndevices + 1; ++j) {
	std::cout << times[j] << "\tseconds" << std::endl;
	double gflops = 3.0e-9 * n * niter / times[j];
	std::cout << gflops << "\tGFLOPS" << std::endl;
	double gbs = 3.0e-9 * n * niter * sizeof(float) / times[j];
	std::cout << gbs << "\tGB/s" << std::endl;
	}
	for (cl_uint j = 0; j < ndevices + 1; ++j) {
	times[j] = 0;
	}
	time0 = time(0);
	for (cl_uint j = 0; j < ndevices; ++j) {
	for (int iter = 0; iter < niter; ++iter) {
	time_t time1 = time0;
	cl_event event;
	clerr = clEnqueueNDRangeKernel(queues[j],
	kernel,
	1,
	0,
	&n,
	0,
	0,
	0,
	&event);
	clerr = clWaitForEvents(1,
	&event);
	time0 = time(0);
	times[j] += time0 - time1;
	}
	}
	{
	for (int iter = 0; iter < niter; ++iter) {
	time_t time1 = time0;
	#pragma omp parallel for
	for (ptrdiff_t j = 0; j < n; ++j) {
	c[j] += a[j] * b[j];
	}
	time0 = time(0);
	times[ndevices] += time0 - time1;
	}
	}
	std::cout << "Intra-Device Accesses" << std::endl;
	for (cl_uint j = 0; j < ndevices + 1; ++j) {
	std::cout << times[j] << "\tseconds" << std::endl;
	double gflops = 3.0e-9 * n * niter / times[j];
	std::cout << gflops << "\tGFLOPS" << std::endl;
	double gbs = 3.0e-9 * n * niter * sizeof(float) / times[j];
	std::cout << gbs << "\tGB/s" << std::endl;
	}
	double sum = 0.0;
	for (ptrdiff_t j = 0; j < n; ++j) {
	sum += c[j];
	}
	std::cout << sum << std::endl;
	std::cout << 2.0 * n * niter * (ndevices + 1) * 2 << std::endl;
	}
	std::string wait;
	std::cin >> wait;
	return 0;
	}