Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save morris821028/1ee07f52d494217ae26933d352c7f07f to your computer and use it in GitHub Desktop.
Save morris821028/1ee07f52d494217ae26933d352c7f07f to your computer and use it in GitHub Desktop.
Judge Girl 10091. Fast Matrix Multiplication (OpenCL)
#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#define MAXGPU 8
#define MAXN 2048
uint32_t hostA[MAXN*MAXN], hostB[MAXN*MAXN], hostC[MAXN*MAXN];
int N = MAXN;
char clSrcFormat[1024] =
"#define MAXR %d\n"
"#define MAXC %d\n"
"#define CTYPE unsigned int\n"
"\n"
"__kernel void matrixMul(__global CTYPE *in1, \n"
" __global CTYPE *in2, \n"
" __global CTYPE *out) { \n"
" CTYPE rbuf[MAXC]; \n"
" int r = get_global_id(0); \n"
" int localID = get_local_id(0); \n"
" int localSz = get_local_size(0); \n"
" __local CTYPE cbuf[MAXC]; \n"
" for (int i = 0; i < MAXC; i++) \n"
" rbuf[i] = in1[r * MAXR + i]; \n"
" for (int c = 0; c < MAXC; c++) { \n"
" for (int cr = localID; cr < MAXC; cr += localSz) \n"
" cbuf[cr] = in2[cr * MAXC + c]; \n"
" barrier(CLK_LOCAL_MEM_FENCE); \n"
" CTYPE sum = 0; \n"
" for (int k = 0; k < MAXC; k++) \n"
" sum += rbuf[k] * cbuf[k]; \n"
" out[r * MAXC + c] = sum; \n"
" } \n"
"}\n";
char clSrc[1024] = "";
char clSrcMain[1024] = "matrixMul";
// -- start working with OpenCL
cl_context clCtx;
cl_program clPrg;
cl_kernel clKrn;
cl_command_queue clQue;
cl_mem clMemIn1, clMemIn2, clMemOut;
#define FailAndExit destroyGPU(clCtx, clPrg, clKrn, clQue, clMemIn1, clMemIn2, clMemOut)
#define clFuncArgs cl_context *clCtx, cl_program *clPrg, cl_kernel *clKrn, \
cl_command_queue *clQue, cl_mem *clMemIn1, cl_mem *clMemIn2, \
cl_mem *clMemOut
#define clCallFunc &clCtx, &clPrg, &clKrn, &clQue, &clMemIn1, &clMemIn2, &clMemOut
void destroyGPU(clFuncArgs) {
fprintf(stderr, "Starting Cleanup ...\n\n");
if (*clMemOut) clReleaseMemObject(*clMemOut);
if (*clMemIn2) clReleaseMemObject(*clMemIn2);
if (*clMemIn1) clReleaseMemObject(*clMemIn1);
if (*clKrn) clReleaseKernel(*clKrn);
if (*clPrg) clReleaseProgram(*clPrg);
if (*clQue) clReleaseCommandQueue(*clQue);
if (*clCtx) clReleaseContext(*clCtx);
exit(0);
}
int initAllGPU(clFuncArgs) {
sprintf(clSrc, clSrcFormat, N, N);
// fprintf(stderr, "%s\n", clSrc);
cl_int clStat;
cl_uint clPlatN, clGPUN;
cl_platform_id clPlatID;
cl_device_id clGPUID[MAXGPU];
const char *clSrcPtr = clSrc;
size_t clSrcLen = (size_t) strlen(clSrc);
// -- basic OpenCL setup
clGetPlatformIDs(1, &clPlatID, &clPlatN);
clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, MAXGPU, clGPUID, &clGPUN);
*clCtx = clCreateContext(NULL, clGPUN, clGPUID, NULL, NULL, &clStat);
if (clStat != CL_SUCCESS) {
printf("Error in clCreateContext, Line %u in file %s\n\n", __LINE__, __FILE__);
FailAndExit;
}
*clQue = clCreateCommandQueue(*clCtx, clGPUID[0], 0, &clStat);
if (clStat != CL_SUCCESS) {
printf("Error in clCreateCommandQueue, Line %u in file %s\n\n", __LINE__, __FILE__);
FailAndExit;
}
*clPrg = clCreateProgramWithSource(*clCtx, 1, &clSrcPtr, &clSrcLen, &clStat);
if (clStat != CL_SUCCESS) {
printf("Error in clCreateProgramWithSource, Line %u in file %s\n\n", __LINE__, __FILE__);
FailAndExit;
}
clStat = clBuildProgram(*clPrg, 1, clGPUID, NULL, NULL, NULL);
if (clStat != CL_SUCCESS) {
printf("Error in clBuildProgram, Line %u in file %s\n\n", __LINE__, __FILE__);
size_t log_size;
clGetProgramBuildInfo(*clPrg, clGPUID[0], CL_PROGRAM_BUILD_STATUS,
sizeof(cl_build_status), &clStat, NULL);
clGetProgramBuildInfo(*clPrg, clGPUID[0],
CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *program_log = (char *) calloc(log_size+1, sizeof(char));
clGetProgramBuildInfo(*clPrg, clGPUID[0],
CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
printf("%s", program_log);
free(program_log);
FailAndExit;
}
*clKrn = clCreateKernel(*clPrg, clSrcMain, &clStat);
if (clStat != CL_SUCCESS) {
printf("Error in clCreateKernel, Line %u in file %s\n\n", __LINE__, __FILE__);
FailAndExit;
}
// -- create all buffers
cl_mem_flags clInBuffFlag = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR;
cl_mem_flags clOutBuffFlag = CL_MEM_WRITE_ONLY;
*clMemIn1 = clCreateBuffer(*clCtx, clInBuffFlag, sizeof(uint32_t)*N*N,
hostA, &clStat);
if (clStat != CL_SUCCESS) {
printf("Error in clCreateBuffer, Line %u in file %s\n\n", __LINE__, __FILE__);
FailAndExit;
}
*clMemIn2 = clCreateBuffer(*clCtx, clInBuffFlag, sizeof(uint32_t)*N*N,
hostB, &clStat);
if (clStat != CL_SUCCESS) {
printf("Error in clCreateBuffer, Line %u in file %s\n\n", __LINE__, __FILE__);
FailAndExit;
}
*clMemOut = clCreateBuffer(*clCtx, clOutBuffFlag, sizeof(uint32_t)*N*N,
hostC, &clStat);
if (clStat != CL_SUCCESS) {
printf("Error in clCreateBuffer, Line %u in file %s\n\n", __LINE__, __FILE__);
FailAndExit;
}
// -- set argument to kernel
clStat = clSetKernelArg(*clKrn, 0, sizeof(cl_mem), (void *) clMemIn1);
if (clStat != CL_SUCCESS) {
printf("Error in clSetKernelArg, Line %u in file %s\n\n", __LINE__, __FILE__);
FailAndExit;
}
clStat = clSetKernelArg(*clKrn, 1, sizeof(cl_mem), (void *) clMemIn2);
if (clStat != CL_SUCCESS) {
printf("Error in clSetKernelArg, Line %u in file %s\n\n", __LINE__, __FILE__);
FailAndExit;
}
clStat = clSetKernelArg(*clKrn, 2, sizeof(cl_mem), (void *) clMemOut);
if (clStat != CL_SUCCESS) {
printf("Error in clSetKernelArg, Line %u in file %s\n\n", __LINE__, __FILE__);
FailAndExit;
}
return 1;
}
int executeGPU(clFuncArgs) {
cl_int clStat;
size_t globalOffset[] = {0, 0};
size_t globalSize[] = {N};
size_t localSize[] = {64};
clStat = clEnqueueNDRangeKernel(*clQue, *clKrn, 1, globalOffset,
globalSize, localSize, 0, NULL, NULL);
if (clStat != CL_SUCCESS) {
printf("Error in clEnqueueNDRangeKernel, Line %u in file %s\n\n", __LINE__, __FILE__);
FailAndExit;
}
clFinish(*clQue);
// -- read back
clEnqueueReadBuffer(*clQue, *clMemOut, CL_TRUE, 0, sizeof(uint32_t)*N*N,
hostC, 0, NULL, NULL);
return 1;
}
void readIn() {
uint32_t c1, c2;
scanf("%d %u %u", &N, &c1, &c2);
uint32_t x = 2, n = N*N;
x = 2;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c1 + i + j)%n;
hostA[i*N+j] = x;
}
}
x = 2;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c2 + i + j)%n;
hostB[i*N+j] = x;
}
}
}
void writeOut() {
uint32_t h = 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
h = (h + hostC[i*N+j]) * 2654435761LU;
}
printf("%u\n", h);
}
void onStart() {
readIn();
initAllGPU(clCallFunc);
executeGPU(clCallFunc);
writeOut();
destroyGPU(clCallFunc);
}
void sigHandler(int signo) {
printf("God Bless Me");
destroyGPU(clCallFunc);
exit(0);
}
int main(int argc, char *argv[]) {
const char sigErr[] = "I can't catch signal.\n";
if (signal(SIGTRAP, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGSEGV, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGILL, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGFPE, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGKILL, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGINT, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
onStart();
return 0;
}
#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#define MAXGPU 8
#define MAXN 2048
uint32_t hostA[MAXN*MAXN], hostB[MAXN*MAXN], hostC[MAXN*MAXN];
int N = MAXN;
char clSrcFormat[1024] =
"#define MAXR %d\n"
"#define MAXC %d\n"
"#define CTYPE unsigned int\n"
"\n"
"__kernel void matrixMul(__global CTYPE *in1, \n"
" __global CTYPE *in2, \n"
" __global CTYPE *out) { \n"
" CTYPE rbuf[MAXC]; \n"
" int r = get_global_id(0); \n"
" int localID = get_local_id(0); \n"
" int localSz = get_local_size(0); \n"
" __local CTYPE cbuf[MAXC]; \n"
" for (int i = 0; i < MAXC; i++) \n"
" rbuf[i] = in1[r * MAXR + i]; \n"
" for (int c = 0; c < MAXC; c++) { \n"
" for (int cr = localID; cr < MAXC; cr += localSz) \n"
" cbuf[cr] = in2[cr * MAXC + c]; \n"
" barrier(CLK_LOCAL_MEM_FENCE); \n"
" CTYPE sum = 0; \n"
" for (int k = 0; k < MAXC; k++) \n"
" sum += rbuf[k] * cbuf[k]; \n"
" out[r * MAXC + c] = sum; \n"
" } \n"
"}\n";
char clSrc[1024] = "";
char clSrcMain[1024] = "matrixMul";
// -- start working with OpenCL
cl_context clCtx;
cl_program clPrg;
cl_kernel clKrn;
cl_command_queue clQue;
cl_mem clMemIn1, clMemIn2, clMemOut;
#define FailAndExit destroyGPU(clCtx, clPrg, clKrn, clQue, clMemIn1, clMemIn2, clMemOut)
#define clFuncArgs cl_context *clCtx, cl_program *clPrg, cl_kernel *clKrn, \
cl_command_queue *clQue, cl_mem *clMemIn1, cl_mem *clMemIn2, \
cl_mem *clMemOut
#define clCallFunc &clCtx, &clPrg, &clKrn, &clQue, &clMemIn1, &clMemIn2, &clMemOut
void destroyGPU(clFuncArgs) {
if (*clMemOut) clReleaseMemObject(*clMemOut);
if (*clMemIn2) clReleaseMemObject(*clMemIn2);
if (*clMemIn1) clReleaseMemObject(*clMemIn1);
if (*clKrn) clReleaseKernel(*clKrn);
if (*clPrg) clReleaseProgram(*clPrg);
if (*clQue) clReleaseCommandQueue(*clQue);
if (*clCtx) clReleaseContext(*clCtx);
exit(0);
}
int initAllGPU(clFuncArgs) {
sprintf(clSrc, clSrcFormat, N, N);
fprintf(stderr, "%s\n", clSrc);
exit(0);
cl_int clStat;
cl_uint clPlatN, clGPUN;
cl_platform_id clPlatID;
cl_device_id clGPUID[MAXGPU];
const char *clSrcPtr = clSrc;
size_t clSrcLen = (size_t) strlen(clSrc);
// -- basic OpenCL setup
clGetPlatformIDs(1, &clPlatID, &clPlatN);
clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, MAXGPU, clGPUID, &clGPUN);
*clCtx = clCreateContext(NULL, clGPUN, clGPUID, NULL, NULL, &clStat);
*clQue = clCreateCommandQueue(*clCtx, clGPUID[0], 0, &clStat);
*clPrg = clCreateProgramWithSource(*clCtx, 1, &clSrcPtr, &clSrcLen, &clStat);
clStat = clBuildProgram(*clPrg, 1, clGPUID, NULL, NULL, NULL);
*clKrn = clCreateKernel(*clPrg, clSrcMain, &clStat);
// -- create all buffers
cl_mem_flags clInBuffFlag = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR;
cl_mem_flags clOutBuffFlag = CL_MEM_WRITE_ONLY;
*clMemIn1 = clCreateBuffer(*clCtx, clInBuffFlag, sizeof(uint32_t)*N*N,
hostA, &clStat);
*clMemIn2 = clCreateBuffer(*clCtx, clInBuffFlag, sizeof(uint32_t)*N*N,
hostB, &clStat);
*clMemOut = clCreateBuffer(*clCtx, clOutBuffFlag, sizeof(uint32_t)*N*N,
hostC, &clStat);
// -- set argument to kernel
clStat = clSetKernelArg(*clKrn, 0, sizeof(cl_mem), (void *) clMemIn1);
clStat = clSetKernelArg(*clKrn, 1, sizeof(cl_mem), (void *) clMemIn2);
clStat = clSetKernelArg(*clKrn, 2, sizeof(cl_mem), (void *) clMemOut);
return 1;
}
int executeGPU(clFuncArgs) {
cl_int clStat;
size_t globalOffset[] = {0, 0};
size_t globalSize[] = {N};
size_t localSize[] = {64};
clStat = clEnqueueNDRangeKernel(*clQue, *clKrn, 1, globalOffset,
globalSize, localSize, 0, NULL, NULL);
clFinish(*clQue);
// -- read back
clEnqueueReadBuffer(*clQue, *clMemOut, CL_TRUE, 0, sizeof(uint32_t)*N*N,
hostC, 0, NULL, NULL);
return 1;
}
void readIn() {
uint32_t c1, c2;
scanf("%d %u %u", &N, &c1, &c2);
uint32_t x = 2, n = N*N;
x = 2;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c1 + i + j)%n;
hostA[i*N+j] = x;
}
}
x = 2;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c2 + i + j)%n;
hostB[i*N+j] = x;
}
}
}
void writeOut() {
uint32_t h = 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
h = (h + hostC[i*N+j]) * 2654435761LU;
}
printf("%u\n", h);
}
void onStart() {
readIn();
initAllGPU(clCallFunc);
executeGPU(clCallFunc);
writeOut();
destroyGPU(clCallFunc);
}
void sigHandler(int signo) {
printf("God Bless Me");
destroyGPU(clCallFunc);
exit(0);
}
int main(int argc, char *argv[]) {
const char sigErr[] = "I can't catch signal.\n";
if (signal(SIGTRAP, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGSEGV, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGILL, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGFPE, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGKILL, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGINT, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
onStart();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment