Simple CUDA and OpenCL code

Compilation:
* CUDA (*.cu): nvcc filename.cu
* CUDA + CUBLAS (*.cu): nvcc filename.cu -lcublas
* OpenCL (*.c): gcc filename.c -lOpenCL
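
For example, assuming the OpenCL device query below is saved as device_query.c and the cuBLAS matrix multiplication as mmul_cublas.cu (the host source file names are not fixed by this gist, so pick any):

    gcc device_query.c -o device_query -lOpenCL
    ./device_query
    nvcc mmul_cublas.cu -o mmul_cublas -lcublas
    ./mmul_cublas

The OpenCL host programs also expect their kernel files (mmul.cl, saxpy.cl, vectorAdd.cl) in the working directory at run time.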
// device_query.c
// [email protected]
// Original source:
// * http://stackoverflow.com/questions/17240071/what-is-the-right-way-to-call-clgetplatforminfo
// * Banger, R., Bhattacharyya, K. "OpenCL Programming by Example". 2013. Packt Publishing. p43

#include <stdio.h>
#include <stdlib.h>

#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

#define NELEMS(x) (sizeof(x) / sizeof((x)[0]))

const cl_platform_info attributeTypes[5] = {
    CL_PLATFORM_NAME,
    CL_PLATFORM_VENDOR,
    CL_PLATFORM_VERSION,
    CL_PLATFORM_PROFILE,
    CL_PLATFORM_EXTENSIONS
};

const char* const attributeNames[] = {
    "CL_PLATFORM_NAME",
    "CL_PLATFORM_VENDOR",
    "CL_PLATFORM_VERSION",
    "CL_PLATFORM_PROFILE",
    "CL_PLATFORM_EXTENSIONS"
};

void PrintDeviceInfo(cl_device_id device)
{
    char queryBuffer[1024];
    cl_uint queryInt;
    cl_int clError;

    clError = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(queryBuffer), queryBuffer, NULL);
    printf("    CL_DEVICE_NAME: %s\n", queryBuffer);
    queryBuffer[0] = '\0';
    clError = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(queryBuffer), queryBuffer, NULL);
    printf("    CL_DEVICE_VENDOR: %s\n", queryBuffer);
    queryBuffer[0] = '\0';
    clError = clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(queryBuffer), queryBuffer, NULL);
    printf("    CL_DRIVER_VERSION: %s\n", queryBuffer);
    queryBuffer[0] = '\0';
    clError = clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(queryBuffer), queryBuffer, NULL);
    printf("    CL_DEVICE_VERSION: %s\n", queryBuffer);
    queryBuffer[0] = '\0';
    clError = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(queryInt), &queryInt, NULL);
    printf("    CL_DEVICE_MAX_COMPUTE_UNITS: %u\n", queryInt);
}

int main(void) {
    int i, j, k, num_attributes;
    char *info;
    cl_platform_id *platforms = NULL;
    cl_uint num_platforms;
    cl_device_id *device_list = NULL;
    cl_uint num_devices;
    cl_int clStatus;
    size_t infoSize;

    // Get platform and device information
    clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
    platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms);
    clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);

    // For each platform print all attributes
    num_attributes = NELEMS(attributeTypes);
    // printf("\nAttribute Count = %d ", num_attributes);
    for (i = 0; i < (int) num_platforms; i++) {
        printf("Platform - %d\n", i + 1);
        for (j = 0; j < num_attributes; j++) {
            // Get platform attribute value size
            clGetPlatformInfo(platforms[i], attributeTypes[j], 0, NULL, &infoSize);
            info = (char *) malloc(infoSize);
            // Get platform attribute value
            clGetPlatformInfo(platforms[i], attributeTypes[j], infoSize, info, NULL);
            printf(" %d.%d %-11s: %s\n", i + 1, j + 1, attributeNames[j], info);
            free(info);
        }
        // Get the device list of this platform and print each device's info
        clStatus = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
        device_list = (cl_device_id *) malloc(sizeof(cl_device_id) * num_devices);
        clStatus = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
        for (k = 0; k < (int) num_devices; k++) {
            printf("  Device - %d:\n", k + 1);
            PrintDeviceInfo(device_list[k]);
        }
        free(device_list);
    }
    free(platforms);
    return 0;
}
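
The calls above store their return codes in clError and clStatus but never inspect them. A minimal status-checking helper, reusing the headers already included in this file, could look like the sketch below (the name checkCl is an illustrative assumption, not part of the original code):

// Print the failing call and abort when an OpenCL API call does not return CL_SUCCESS.
void checkCl(cl_int status, const char *what)
{
    if (status != CL_SUCCESS) {
        fprintf(stderr, "OpenCL error %d in %s\n", (int) status, what);
        exit(EXIT_FAILURE);
    }
}

// Usage:
//   clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
//   checkCl(clStatus, "clGetPlatformIDs");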
__kernel void matrixMul(__global float* C, __global float* A, __global float* B, int width)
{
    // 2D thread ID
    int tx = get_global_id(0);
    int ty = get_global_id(1);

    // value stores the element that is computed by the thread
    float value = 0;
    int i = 0;
    for (i = 0; i < width; ++i)
    {
        value += A[ty * width + i] * B[i * width + tx];
    }

    // Write the matrix to device memory: each thread writes one element
    C[ty * width + tx] = value;
}
/**
 * Square matrix multiplication with OpenCL
 * Source: http://gpgpu-computing4.blogspot.co.id/2009/09/matrix-multiplication-2-opencl.html
 **/
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

#define WIDTH 1024     // matrix width (number of rows/columns)
#define TILE_SIZE 16   // tile (submatrix) width
#define MAX_SOURCE_SIZE (0x100000)

char *oclLoadProgSource(char *fileName, char *comment, size_t *source_size)
{
    /* Load the source code containing the kernel */
    FILE *fp = fopen(fileName, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    char *source_str = (char *) malloc(MAX_SOURCE_SIZE);
    *source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);
    return source_str;
}

void randomInit(float* data, int size)
{
    int i = 0;
    for (i = 0; i < size; ++i)
        data[i] = rand() / (float)RAND_MAX;
}

void validateMatrixMul(float* C, float* A, float* B, int width) {
    int i, j, k = 0;
    float sum = .0f;
    for (i = 0; i < width; i++) {
        for (j = 0; j < width; j++) {
            sum = .0f;
            for (k = 0; k < width; k++) {
                sum = sum + A[i*width+k] * B[k*width+j];
            }
            if (fabs(C[i*width+j] - sum) > 1e-3)
            {
                fprintf(stderr, "Result verification failed at element %d!\n", i*width+j);
                exit(EXIT_FAILURE);
            }
        }
    }
}

int main(void)
{
    // Set these to the indices of the platform/device you want to use
    // (indices follow the output of device_query.c)
    int platformId = 0;
    int deviceId = 0;

    // Allocate host memory
    unsigned int size = WIDTH * WIDTH;
    unsigned int mem_size = sizeof(float) * size;
    float* h_A = (float*) malloc(mem_size);
    float* h_B = (float*) malloc(mem_size);
    float* h_C = (float*) malloc(mem_size);

    // Random initialization
    randomInit(h_A, size);
    randomInit(h_B, size);

    cl_int clStatus;

    // Get the list of platforms
    cl_uint num_platforms;
    clGetPlatformIDs(0, NULL, &num_platforms);
    cl_platform_id *platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id)*num_platforms);
    clGetPlatformIDs(num_platforms, platforms, NULL);

    // Use the platform selected by platformId
    cl_platform_id cpPlatform = platforms[platformId];

    // Get the list of devices
    cl_uint num_devices;
    clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
    cl_device_id *device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
    clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);

    // Use the device selected by deviceId
    cl_device_id cdDevice = device_list[deviceId];

    // Create the context
    cl_context cxGPUContext = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &clStatus);

    // Create the command queue (OpenCL < 2.0)
    cl_command_queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &clStatus);
    // Create the command queue (OpenCL >= 2.0)
    // cl_command_queue cqCommandQueue = clCreateCommandQueueWithProperties(cxGPUContext, cdDevice, 0, &clStatus);

    // Set up device memory
    cl_mem d_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size, NULL, &clStatus);
    cl_mem d_B = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size, NULL, &clStatus);
    cl_mem d_C = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, mem_size, NULL, &clStatus);

    // Write (copy) data from host to device memory
    clEnqueueWriteBuffer(cqCommandQueue, d_A, CL_FALSE, 0, sizeof(cl_float) * size, h_A, 0, NULL, NULL);
    clEnqueueWriteBuffer(cqCommandQueue, d_B, CL_FALSE, 0, sizeof(cl_float) * size, h_B, 0, NULL, NULL);

    // Read the kernel from an external file and create the program
    size_t szKernelLength;
    char *cSourceCL = oclLoadProgSource("mmul.cl", "// My comment\n", &szKernelLength);
    cl_program clProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &clStatus);
    clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL);
    cl_kernel clKernel = clCreateKernel(clProgram, "matrixMul", &clStatus);

    // Set the kernel arguments
    int w = WIDTH;
    clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C);
    clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A);
    clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B);
    clSetKernelArg(clKernel, 3, sizeof(cl_int), (void *)&w);

    // Run the kernel
    size_t localWorkSize[] = {TILE_SIZE, TILE_SIZE}; // work-group (block) size
    size_t globalWorkSize[] = {WIDTH, WIDTH};        // total number of work-items (threads)
    clEnqueueNDRangeKernel(cqCommandQueue, clKernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);

    // Copy the result back from device memory (blocking read)
    clEnqueueReadBuffer(cqCommandQueue, d_C, CL_TRUE, 0, mem_size, h_C, 0, NULL, NULL);

    // Validation
    // validateMatrixMul(h_C, h_A, h_B, WIDTH);
    // printf("Test PASSED\n");

    // Release OpenCL objects
    if (cqCommandQueue) {
        clFlush(cqCommandQueue);
        clFinish(cqCommandQueue);
    }
    clReleaseKernel(clKernel);
    clReleaseProgram(clProgram);
    clReleaseMemObject(d_A);
    clReleaseMemObject(d_B);
    clReleaseMemObject(d_C);
    clReleaseCommandQueue(cqCommandQueue);
    clReleaseContext(cxGPUContext);

    // Free host memory
    free(cSourceCL);
    free(h_A);
    free(h_B);
    free(h_C);
    free(device_list);
    free(platforms);
    return 0;
}
/**
 * Parallel square matrix multiplication with cuBLAS
 *
 * Reference: https://raw.githubusercontent.com/sol-prog/cuda_cublas_curand_thrust/master/mmul_1.cu
 *
 **/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cublas_v2.h>

#define WIDTH 1024

void randomInit(float* data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = rand() / (float)RAND_MAX;
}

void validateMatrixMul(float* C, float* A, float* B, int width) {
    int i, j, k = 0;
    float sum = .0f;
    for (i = 0; i < width; i++) {
        for (j = 0; j < width; j++) {
            sum = .0f;
            for (k = 0; k < width; k++) {
                sum = sum + A[i*width+k] * B[k*width+j];
            }
            if (fabs(C[i*width+j] - sum) > 1e-3)
            {
                fprintf(stderr, "Result verification failed at element %d!\n", i*width+j);
                exit(EXIT_FAILURE);
            }
        }
    }
}

int main() {
    // Allocate variables in host memory
    unsigned int size = WIDTH * WIDTH;
    unsigned int mem_size = sizeof(float) * size;
    float* h_A = (float*) malloc(mem_size);
    float* h_B = (float*) malloc(mem_size);
    float* h_C = (float*) malloc(mem_size);

    // Random initialization
    randomInit(h_A, size);
    randomInit(h_B, size);

    // Allocate variables in device memory
    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, WIDTH * WIDTH * sizeof(float));
    cudaMalloc(&d_B, WIDTH * WIDTH * sizeof(float));
    cudaMalloc(&d_C, WIDTH * WIDTH * sizeof(float));

    // Copy variables from host to device memory
    cudaMemcpy(d_A, h_A, WIDTH * WIDTH * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, WIDTH * WIDTH * sizeof(float), cudaMemcpyHostToDevice);

    // Execute the matrix multiplication
    const float alf = 1.0f;
    const float bet = 0.0f;
    const float *alpha = &alf;
    const float *beta = &bet;
    cublasHandle_t handle;
    cublasCreate(&handle);
    // Note: the positions of d_A and d_B are swapped because we store the matrices in
    // row-major order (see the note after this file)
    // https://ipfs.io/ipfs/QmXoypizjW3WknFiJnKLwHCnL72vedxjQkDDP1mXWo6uco/wiki/Row-major_order.html
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, WIDTH, WIDTH, alpha, d_B, WIDTH, d_A, WIDTH, beta, d_C, WIDTH);

    // Copy the result from device to host memory
    cudaMemcpy(h_C, d_C, WIDTH * WIDTH * sizeof(float), cudaMemcpyDeviceToHost);
    cublasDestroy(handle);

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // Validation
    // validateMatrixMul(h_C, h_A, h_B, WIDTH);
    // printf("Test PASSED\n");

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);
    return EXIT_SUCCESS;
}
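
A short note on the operand swap in the cublasSgemm call above: cuBLAS follows the BLAS convention and treats every matrix as column-major, so a row-major array M is read by cuBLAS as the transpose M^T. Using the identity

    C = A * B   <=>   C^T = B^T * A^T

asking cuBLAS (with CUBLAS_OP_N, i.e. no explicit transposes) to multiply what it sees as B^T and A^T produces C^T in column-major layout, which is exactly the row-major C that the host code expects.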
/**
 * Parallel square matrix multiplication
 *
 * Reference: http://gpgpu-computing4.blogspot.co.id/2009/08/matrix-multiplication-2.html
 *
 **/
#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#define WIDTH 1024     // matrix width
#define TILE_SIZE 16   // tile/submatrix width

__global__ void matrixMul(float* C, float* A, float* B, int width)
{
    // 2D thread ID
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;

    // Compute the multiplication for element C[ty][tx], i.e. C[ty * width + tx]
    float value = 0;
    for (int i = 0; i < width; ++i)
    {
        float elementA = A[ty * width + i];
        float elementB = B[i * width + tx];
        value += elementA * elementB;
    }
    C[ty * width + tx] = value;
}

void randomInit(float* data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = rand() / (float)RAND_MAX;
}

void validateMatrixMul(float* C, float* A, float* B, int width) {
    int i, j, k = 0;
    float sum = .0f;
    for (i = 0; i < width; i++) {
        for (j = 0; j < width; j++) {
            sum = .0f;
            for (k = 0; k < width; k++) {
                sum = sum + A[i*width+k] * B[k*width+j];
            }
            if (fabs(C[i*width+j] - sum) > 1e-3)
            {
                fprintf(stderr, "Result verification failed at element %d!\n", i*width+j);
                exit(EXIT_FAILURE);
            }
        }
    }
}

int main()
{
    // Allocate host memory
    unsigned int size = WIDTH * WIDTH;
    unsigned int mem_size = sizeof(float) * size;
    float* h_A = (float*) malloc(mem_size);
    float* h_B = (float*) malloc(mem_size);
    float* h_C = (float*) malloc(mem_size);

    // Random initialization
    randomInit(h_A, size);
    randomInit(h_B, size);

    // Allocate device memory
    float *d_A, *d_B, *d_C;
    cudaMalloc((void**) &d_A, mem_size);
    cudaMalloc((void**) &d_B, mem_size);
    cudaMalloc((void**) &d_C, mem_size);

    // Copy data to device memory
    cudaMemcpy(d_A, h_A, mem_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, mem_size, cudaMemcpyHostToDevice);

    // Run the kernel:
    // 2D block dimension = 16 * 16 threads
    // 2D grid dimension  = 64 * 64 blocks
    // total threads      = 64 * 64 * 16 * 16 = 1048576
    dim3 blockDim(TILE_SIZE, TILE_SIZE);
    dim3 gridDim(WIDTH / TILE_SIZE, WIDTH / TILE_SIZE);
    matrixMul<<< gridDim, blockDim >>>(d_C, d_A, d_B, WIDTH);

    // Copy the result back from the device
    cudaMemcpy(h_C, d_C, mem_size, cudaMemcpyDeviceToHost);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // Validation
    // validateMatrixMul(h_C, h_A, h_B, WIDTH);
    // printf("Test PASSED\n");

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
// SAXPY (Single precision real Alpha X plus Y)
// Original source: Banger, R., Bhattacharyya, K. OpenCL Programming by Example. 2013. Packt Publishing
// By: [email protected]
__kernel void saxpy_kernel(float alpha, __global float *A, __global float *B, __global float *C)
{
    // Get the index of the work-item
    int index = get_global_id(0);
    C[index] = alpha * A[index] + B[index];
}
/**
 * Simplified SAXPY in OpenCL
 * Tested on: CL_PLATFORM_VERSION: OpenCL 1.2 CUDA 9.0.282
 */
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

#define VECTOR_SIZE 1024
#define MAX_SOURCE_SIZE (0x100000)

char *oclLoadProgSource(char *fileName, char *comment, size_t *source_size)
{
    /* Load the source code containing the kernel */
    FILE *fp = fopen(fileName, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    char *source_str = (char *) malloc(MAX_SOURCE_SIZE);
    *source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);
    return source_str;
}

int main(void) {
    // Set these to the indices of the platform/device you want to use
    // (indices follow the output of device_query.c)
    int platformId = 0;
    int deviceId = 0;

    int i;
    char *kernel_filename = "saxpy.cl";
    char *kernel_comment = "// saxpy";
    size_t kernelLength;

    // Allocate space for vectors A, B and C
    float alpha = 2.0;
    float *A = (float*)malloc(sizeof(float)*VECTOR_SIZE);
    float *B = (float*)malloc(sizeof(float)*VECTOR_SIZE);
    float *C = (float*)malloc(sizeof(float)*VECTOR_SIZE);
    for (i = 0; i < VECTOR_SIZE; i++)
    {
        A[i] = i;
        B[i] = VECTOR_SIZE - i;
        C[i] = 0;
    }

    // Get platform and device information
    cl_platform_id *platforms = NULL;
    cl_uint num_platforms;
    cl_device_id *device_list = NULL;
    cl_uint num_devices;
    cl_context context;
    char *kernel_content = NULL;

    // Set up the platform
    cl_int clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
    platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id)*num_platforms);
    clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);

    // Get the device list and choose the device you want to run on
    clStatus = clGetDeviceIDs(platforms[platformId], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
    device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
    clStatus = clGetDeviceIDs(platforms[platformId], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);

    // Create one OpenCL context covering all devices in the platform
    context = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &clStatus);

    // Create a command queue (OpenCL < 2.0)
    cl_command_queue command_queue = clCreateCommandQueue(context, device_list[deviceId], 0, &clStatus);
    // Create a command queue (OpenCL >= 2.0)
    // cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_list[deviceId], 0, &clStatus);

    // Create memory buffers on the device for each vector
    cl_mem A_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
    cl_mem B_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
    cl_mem C_clmem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);

    // Copy buffers A and B to the device
    clStatus = clEnqueueWriteBuffer(command_queue, A_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), A, 0, NULL, NULL);
    clStatus = clEnqueueWriteBuffer(command_queue, B_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), B, 0, NULL, NULL);

    // Create a program from the kernel source
    kernel_content = oclLoadProgSource(kernel_filename, kernel_comment, &kernelLength);
    // printf("%s\n", kernel_content);
    cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernel_content, NULL, &clStatus);

    // Build the program
    clStatus = clBuildProgram(program, 1, device_list, NULL, NULL, NULL);

    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "saxpy_kernel", &clStatus);

    // Set the arguments of the kernel
    clStatus = clSetKernelArg(kernel, 0, sizeof(float), (void *)&alpha);
    clStatus = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&A_clmem);
    clStatus = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&B_clmem);
    clStatus = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&C_clmem);

    // Execute the OpenCL kernel over the whole vector
    size_t global_size = VECTOR_SIZE;  // total number of work-items
    size_t local_size = 64;            // work-group size (must divide global_size)
    clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);

    // Read the device buffer C_clmem back into the host variable C
    clStatus = clEnqueueReadBuffer(command_queue, C_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), C, 0, NULL, NULL);

    // Wait for all the commands to complete
    clStatus = clFlush(command_queue);
    clStatus = clFinish(command_queue);

    // Validate the result
    // for (i = 0; i < VECTOR_SIZE; ++i)
    // {
    //     if (fabs(alpha * A[i] + B[i] - C[i]) > 1e-5)
    //     {
    //         fprintf(stderr, "Result verification failed at element %d!\n", i);
    //         exit(EXIT_FAILURE);
    //     }
    // }
    // printf("Test PASSED\n");

    // Finally release all OpenCL objects and host buffers
    clStatus = clReleaseKernel(kernel);
    clStatus = clReleaseProgram(program);
    clStatus = clReleaseMemObject(A_clmem);
    clStatus = clReleaseMemObject(B_clmem);
    clStatus = clReleaseMemObject(C_clmem);
    clStatus = clReleaseCommandQueue(command_queue);
    clStatus = clReleaseContext(context);
    free(kernel_content);
    free(A);
    free(B);
    free(C);
    free(platforms);
    free(device_list);
    return 0;
}
/**
 * How to get the global thread index for various grid/block indexing schemes
 * Source: http://www.martinpeniak.com/index.php?option=com_content&view=article&catid=17:updates&id=288:cuda-thread-indexing-explained
 *
 */

// 1D grid of 1D blocks
__device__ int getGlobalIdx_1D_1D()
{
    return blockIdx.x * blockDim.x + threadIdx.x;
}

// 1D grid of 2D blocks
__device__ int getGlobalIdx_1D_2D()
{
    return blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
}

// 1D grid of 3D blocks
__device__ int getGlobalIdx_1D_3D()
{
    return blockIdx.x * blockDim.x * blockDim.y * blockDim.z
        + threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
}

// 2D grid of 1D blocks
__device__ int getGlobalIdx_2D_1D()
{
    int blockId = blockIdx.y * gridDim.x + blockIdx.x;
    int threadId = blockId * blockDim.x + threadIdx.x;
    return threadId;
}

// 2D grid of 2D blocks
__device__ int getGlobalIdx_2D_2D()
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
    return threadId;
}

// 2D grid of 3D blocks
__device__ int getGlobalIdx_2D_3D()
{
    int blockId = blockIdx.x
        + blockIdx.y * gridDim.x;
    int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
        + (threadIdx.z * (blockDim.x * blockDim.y))
        + (threadIdx.y * blockDim.x)
        + threadIdx.x;
    return threadId;
}

// 3D grid of 1D blocks
__device__ int getGlobalIdx_3D_1D()
{
    int blockId = blockIdx.x
        + blockIdx.y * gridDim.x
        + gridDim.x * gridDim.y * blockIdx.z;
    int threadId = blockId * blockDim.x + threadIdx.x;
    return threadId;
}

// 3D grid of 2D blocks
__device__ int getGlobalIdx_3D_2D()
{
    int blockId = blockIdx.x
        + blockIdx.y * gridDim.x
        + gridDim.x * gridDim.y * blockIdx.z;
    int threadId = blockId * (blockDim.x * blockDim.y)
        + (threadIdx.y * blockDim.x)
        + threadIdx.x;
    return threadId;
}

// 3D grid of 3D blocks
__device__ int getGlobalIdx_3D_3D()
{
    int blockId = blockIdx.x
        + blockIdx.y * gridDim.x
        + gridDim.x * gridDim.y * blockIdx.z;
    int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
        + (threadIdx.z * (blockDim.x * blockDim.y))
        + (threadIdx.y * blockDim.x)
        + threadIdx.x;
    return threadId;
}
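
A minimal usage sketch for these helpers (the kernel name fillGlobalIdx, the buffer d_out and the launch configuration are illustrative assumptions, not part of the original snippet):

// Each thread of a 2D grid of 2D blocks records its own flattened global index.
__global__ void fillGlobalIdx(int *out, int n)
{
    int idx = getGlobalIdx_2D_2D();
    if (idx < n)
        out[idx] = idx;
}

// Host-side launch matching the 2D-grid / 2D-block scheme:
//   dim3 blockDim(8, 8);   // 64 threads per block
//   dim3 gridDim(4, 4);    // 16 blocks, 1024 threads in total
//   fillGlobalIdx<<<gridDim, blockDim>>>(d_out, 1024);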
/**
 * Vector addition: C = A + B.
 * Serial CPU execution
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    // number of vector elements
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    for (int i = 0; i < numElements; ++i)
    {
        h_C[i] = h_A[i] + h_B[i];
    }

    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
__kernel void VectorAdd(__global const float* a, __global const float* b, __global float* c, int iNumElements)
{
    // get the global work-item (thread) index
    int iGID = get_global_id(0);

    // the number of work-items (threads) can exceed iNumElements
    if (iGID < iNumElements)
    {
        // add the iGID-th elements of the vectors
        c[iGID] = a[iGID] + b[iGID];
    }
}
/*
 * Vector addition
 *
 * Tested on CL_PLATFORM_VERSION: OpenCL 1.2 CUDA 9.0.282
 */
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

#define NUM_ELEMENTS 50000
#define MAX_SOURCE_SIZE (0x100000)

char *oclLoadProgSource(char *fileName, char *comment, size_t *source_size)
{
    /* Load the source code containing the kernel */
    FILE *fp = fopen(fileName, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    char *source_str = (char *) malloc(MAX_SOURCE_SIZE);
    *source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);
    return source_str;
}

int main(void)
{
    // Set these to the indices of the platform/device you want to use
    // (indices follow the output of device_query.c)
    int platformId = 0;
    int deviceId = 0;

    int i = 0;
    int iNumElements = NUM_ELEMENTS;

    // Allocate and initialize host memory
    float *srcA = (float *)malloc(sizeof(float) * iNumElements);
    float *srcB = (float *)malloc(sizeof(float) * iNumElements);
    float *dst = (float *)malloc(sizeof(float) * iNumElements);
    for (i = 0; i < iNumElements; ++i)
    {
        srcA[i] = rand()/(float)RAND_MAX;
        srcB[i] = rand()/(float)RAND_MAX;
    }

    cl_int clStatus;

    // Get the list of platforms
    cl_uint num_platforms;
    clGetPlatformIDs(0, NULL, &num_platforms);
    cl_platform_id *platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id)*num_platforms);
    clGetPlatformIDs(num_platforms, platforms, NULL);

    // Use the platform selected by platformId
    cl_platform_id cpPlatform = platforms[platformId];

    // Get the list of devices
    cl_uint num_devices;
    clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
    cl_device_id *device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
    clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);

    // Use the device selected by deviceId
    cl_device_id cdDevice = device_list[deviceId];

    // Create the context
    cl_context cxGPUContext = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &clStatus);

    // Create the command queue (OpenCL < 2.0)
    cl_command_queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &clStatus);
    // Create the command queue (OpenCL >= 2.0)
    // cl_command_queue cqCommandQueue = clCreateCommandQueueWithProperties(cxGPUContext, cdDevice, 0, &clStatus);

    // Allocate device memory
    cl_mem cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(float) * iNumElements, NULL, &clStatus);
    cl_mem cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(float) * iNumElements, NULL, &clStatus);
    cl_mem cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(float) * iNumElements, NULL, &clStatus);

    // Write (copy) data from host to device memory
    clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * iNumElements, srcA, 0, NULL, NULL);
    clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * iNumElements, srcB, 0, NULL, NULL);

    // Create and build the program from the kernel source
    size_t szKernelLength;
    char *cSourceCL = oclLoadProgSource("vectorAdd.cl", "// My comment\n", &szKernelLength);
    cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &clStatus);
    clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);

    // Create the kernel
    cl_kernel ckKernel = clCreateKernel(cpProgram, "VectorAdd", &clStatus);

    // Set the kernel arguments
    clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
    clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
    clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
    clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);

    // Run the kernel
    size_t szLocalWorkSize = 256; // work-group (block) size
    // total number of work-items (threads), rounded up to a multiple of the work-group size;
    // the kernel guards the extra work-items with the iGID < iNumElements check
    size_t szGlobalWorkSize = ((iNumElements + szLocalWorkSize - 1) / szLocalWorkSize) * szLocalWorkSize;
    clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);

    // Read (copy) the result from the device back to the host
    clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * iNumElements, dst, 0, NULL, NULL);

    // Validate the result
    // for (i = 0; i < iNumElements; i++) {
    //     if (fabs(srcA[i] + srcB[i] - dst[i]) > 1e-5) {
    //         fprintf(stderr, "Result verification failed at element %d!\n", i);
    //         exit(EXIT_FAILURE);
    //     }
    // }
    // printf("Test PASSED\n");

    // Release OpenCL objects
    if (cqCommandQueue) {
        clStatus = clFlush(cqCommandQueue);
        clStatus = clFinish(cqCommandQueue);
    }
    if (ckKernel) clReleaseKernel(ckKernel);
    if (cpProgram) clReleaseProgram(cpProgram);

    // Release device memory
    if (cmDevSrcA) clReleaseMemObject(cmDevSrcA);
    if (cmDevSrcB) clReleaseMemObject(cmDevSrcB);
    if (cmDevDst) clReleaseMemObject(cmDevDst);
    if (cqCommandQueue) clReleaseCommandQueue(cqCommandQueue);
    if (cxGPUContext) clReleaseContext(cxGPUContext);

    // Free host memory
    free(cSourceCL);
    free(srcA);
    free(srcB);
    free(dst);
    free(device_list);
    free(platforms);
    return 0;
}
/**
 * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
 * Vector addition: C = A + B.
 *
 * This sample is a very basic sample that implements element by element
 * vector addition. It is the same as the sample illustrating Chapter 2
 * of the programming guide with some additions like error checking.
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    // with 2D indexing, the x and y attributes are available;
    // with 3D indexing, x, y and z
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    // because the number of launched threads can exceed the number of elements
    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}

int main(void)
{
    // number of vector elements
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    float *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_B, size);
    cudaMalloc((void **)&d_C, size);

    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    // (50000 + 256 - 1) / 256 = 196 blocks per grid,
    // so 196 * 256 = 50176 threads are launched, which is more than the number of elements
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);

    // // alternative
    // dim3 gridDim(blocksPerGrid);
    // dim3 blockDim(threadsPerBlock);
    // vectorAdd<<<gridDim, blockDim>>>(d_A, d_B, d_C, numElements);

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // // validate the result
    // for (int i = 0; i < numElements; ++i)
    // {
    //     if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
    //     {
    //         fprintf(stderr, "Result verification failed at element %d!\n", i);
    //         exit(EXIT_FAILURE);
    //     }
    // }
    // printf("Test PASSED\n");

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}