Last active
April 16, 2016 14:58
-
-
Save morris821028/1ee07f52d494217ae26933d352c7f07f to your computer and use it in GitHub Desktop.
Judge Girl 10091. Fast Matrix Multiplication (OpenCL)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <assert.h> | |
#include <inttypes.h> | |
#include <string.h> | |
#include <signal.h> | |
#include <unistd.h> | |
#include <CL/cl.h> | |
#define MAXGPU 8 | |
#define MAXN 2048 | |
uint32_t hostA[MAXN*MAXN], hostB[MAXN*MAXN], hostC[MAXN*MAXN]; | |
int N = MAXN; | |
char clSrcFormat[1024] = | |
"#define MAXR %d\n" | |
"#define MAXC %d\n" | |
"#define CTYPE unsigned int\n" | |
"\n" | |
"__kernel void matrixMul(__global CTYPE *in1, \n" | |
" __global CTYPE *in2, \n" | |
" __global CTYPE *out) { \n" | |
" CTYPE rbuf[MAXC]; \n" | |
" int r = get_global_id(0); \n" | |
" int localID = get_local_id(0); \n" | |
" int localSz = get_local_size(0); \n" | |
" __local CTYPE cbuf[MAXC]; \n" | |
" for (int i = 0; i < MAXC; i++) \n" | |
" rbuf[i] = in1[r * MAXR + i]; \n" | |
" for (int c = 0; c < MAXC; c++) { \n" | |
" for (int cr = localID; cr < MAXC; cr += localSz) \n" | |
" cbuf[cr] = in2[cr * MAXC + c]; \n" | |
" barrier(CLK_LOCAL_MEM_FENCE); \n" | |
" CTYPE sum = 0; \n" | |
" for (int k = 0; k < MAXC; k++) \n" | |
" sum += rbuf[k] * cbuf[k]; \n" | |
" out[r * MAXC + c] = sum; \n" | |
" } \n" | |
"}\n"; | |
char clSrc[1024] = ""; | |
char clSrcMain[1024] = "matrixMul"; | |
// -- start working with OpenCL | |
cl_context clCtx; | |
cl_program clPrg; | |
cl_kernel clKrn; | |
cl_command_queue clQue; | |
cl_mem clMemIn1, clMemIn2, clMemOut; | |
#define FailAndExit destroyGPU(clCtx, clPrg, clKrn, clQue, clMemIn1, clMemIn2, clMemOut) | |
#define clFuncArgs cl_context *clCtx, cl_program *clPrg, cl_kernel *clKrn, \ | |
cl_command_queue *clQue, cl_mem *clMemIn1, cl_mem *clMemIn2, \ | |
cl_mem *clMemOut | |
#define clCallFunc &clCtx, &clPrg, &clKrn, &clQue, &clMemIn1, &clMemIn2, &clMemOut | |
void destroyGPU(clFuncArgs) { | |
fprintf(stderr, "Starting Cleanup ...\n\n"); | |
if (*clMemOut) clReleaseMemObject(*clMemOut); | |
if (*clMemIn2) clReleaseMemObject(*clMemIn2); | |
if (*clMemIn1) clReleaseMemObject(*clMemIn1); | |
if (*clKrn) clReleaseKernel(*clKrn); | |
if (*clPrg) clReleaseProgram(*clPrg); | |
if (*clQue) clReleaseCommandQueue(*clQue); | |
if (*clCtx) clReleaseContext(*clCtx); | |
exit(0); | |
} | |
int initAllGPU(clFuncArgs) { | |
sprintf(clSrc, clSrcFormat, N, N); | |
// fprintf(stderr, "%s\n", clSrc); | |
cl_int clStat; | |
cl_uint clPlatN, clGPUN; | |
cl_platform_id clPlatID; | |
cl_device_id clGPUID[MAXGPU]; | |
const char *clSrcPtr = clSrc; | |
size_t clSrcLen = (size_t) strlen(clSrc); | |
// -- basic OpenCL setup | |
clGetPlatformIDs(1, &clPlatID, &clPlatN); | |
clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, MAXGPU, clGPUID, &clGPUN); | |
*clCtx = clCreateContext(NULL, clGPUN, clGPUID, NULL, NULL, &clStat); | |
if (clStat != CL_SUCCESS) { | |
printf("Error in clCreateContext, Line %u in file %s\n\n", __LINE__, __FILE__); | |
FailAndExit; | |
} | |
*clQue = clCreateCommandQueue(*clCtx, clGPUID[0], 0, &clStat); | |
if (clStat != CL_SUCCESS) { | |
printf("Error in clCreateCommandQueue, Line %u in file %s\n\n", __LINE__, __FILE__); | |
FailAndExit; | |
} | |
*clPrg = clCreateProgramWithSource(*clCtx, 1, &clSrcPtr, &clSrcLen, &clStat); | |
if (clStat != CL_SUCCESS) { | |
printf("Error in clCreateProgramWithSource, Line %u in file %s\n\n", __LINE__, __FILE__); | |
FailAndExit; | |
} | |
clStat = clBuildProgram(*clPrg, 1, clGPUID, NULL, NULL, NULL); | |
if (clStat != CL_SUCCESS) { | |
printf("Error in clBuildProgram, Line %u in file %s\n\n", __LINE__, __FILE__); | |
size_t log_size; | |
clGetProgramBuildInfo(*clPrg, clGPUID[0], CL_PROGRAM_BUILD_STATUS, | |
sizeof(cl_build_status), &clStat, NULL); | |
clGetProgramBuildInfo(*clPrg, clGPUID[0], | |
CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); | |
char *program_log = (char *) calloc(log_size+1, sizeof(char)); | |
clGetProgramBuildInfo(*clPrg, clGPUID[0], | |
CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); | |
printf("%s", program_log); | |
free(program_log); | |
FailAndExit; | |
} | |
*clKrn = clCreateKernel(*clPrg, clSrcMain, &clStat); | |
if (clStat != CL_SUCCESS) { | |
printf("Error in clCreateKernel, Line %u in file %s\n\n", __LINE__, __FILE__); | |
FailAndExit; | |
} | |
// -- create all buffers | |
cl_mem_flags clInBuffFlag = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR; | |
cl_mem_flags clOutBuffFlag = CL_MEM_WRITE_ONLY; | |
*clMemIn1 = clCreateBuffer(*clCtx, clInBuffFlag, sizeof(uint32_t)*N*N, | |
hostA, &clStat); | |
if (clStat != CL_SUCCESS) { | |
printf("Error in clCreateBuffer, Line %u in file %s\n\n", __LINE__, __FILE__); | |
FailAndExit; | |
} | |
*clMemIn2 = clCreateBuffer(*clCtx, clInBuffFlag, sizeof(uint32_t)*N*N, | |
hostB, &clStat); | |
if (clStat != CL_SUCCESS) { | |
printf("Error in clCreateBuffer, Line %u in file %s\n\n", __LINE__, __FILE__); | |
FailAndExit; | |
} | |
*clMemOut = clCreateBuffer(*clCtx, clOutBuffFlag, sizeof(uint32_t)*N*N, | |
hostC, &clStat); | |
if (clStat != CL_SUCCESS) { | |
printf("Error in clCreateBuffer, Line %u in file %s\n\n", __LINE__, __FILE__); | |
FailAndExit; | |
} | |
// -- set argument to kernel | |
clStat = clSetKernelArg(*clKrn, 0, sizeof(cl_mem), (void *) clMemIn1); | |
if (clStat != CL_SUCCESS) { | |
printf("Error in clSetKernelArg, Line %u in file %s\n\n", __LINE__, __FILE__); | |
FailAndExit; | |
} | |
clStat = clSetKernelArg(*clKrn, 1, sizeof(cl_mem), (void *) clMemIn2); | |
if (clStat != CL_SUCCESS) { | |
printf("Error in clSetKernelArg, Line %u in file %s\n\n", __LINE__, __FILE__); | |
FailAndExit; | |
} | |
clStat = clSetKernelArg(*clKrn, 2, sizeof(cl_mem), (void *) clMemOut); | |
if (clStat != CL_SUCCESS) { | |
printf("Error in clSetKernelArg, Line %u in file %s\n\n", __LINE__, __FILE__); | |
FailAndExit; | |
} | |
return 1; | |
} | |
int executeGPU(clFuncArgs) { | |
cl_int clStat; | |
size_t globalOffset[] = {0, 0}; | |
size_t globalSize[] = {N}; | |
size_t localSize[] = {64}; | |
clStat = clEnqueueNDRangeKernel(*clQue, *clKrn, 1, globalOffset, | |
globalSize, localSize, 0, NULL, NULL); | |
if (clStat != CL_SUCCESS) { | |
printf("Error in clEnqueueNDRangeKernel, Line %u in file %s\n\n", __LINE__, __FILE__); | |
FailAndExit; | |
} | |
clFinish(*clQue); | |
// -- read back | |
clEnqueueReadBuffer(*clQue, *clMemOut, CL_TRUE, 0, sizeof(uint32_t)*N*N, | |
hostC, 0, NULL, NULL); | |
return 1; | |
} | |
void readIn() { | |
uint32_t c1, c2; | |
scanf("%d %u %u", &N, &c1, &c2); | |
uint32_t x = 2, n = N*N; | |
x = 2; | |
for (int i = 0; i < N; i++) { | |
for (int j = 0; j < N; j++) { | |
x = (x * x + c1 + i + j)%n; | |
hostA[i*N+j] = x; | |
} | |
} | |
x = 2; | |
for (int i = 0; i < N; i++) { | |
for (int j = 0; j < N; j++) { | |
x = (x * x + c2 + i + j)%n; | |
hostB[i*N+j] = x; | |
} | |
} | |
} | |
void writeOut() { | |
uint32_t h = 0; | |
for (int i = 0; i < N; i++) { | |
for (int j = 0; j < N; j++) | |
h = (h + hostC[i*N+j]) * 2654435761LU; | |
} | |
printf("%u\n", h); | |
} | |
void onStart() { | |
readIn(); | |
initAllGPU(clCallFunc); | |
executeGPU(clCallFunc); | |
writeOut(); | |
destroyGPU(clCallFunc); | |
} | |
void sigHandler(int signo) { | |
printf("God Bless Me"); | |
destroyGPU(clCallFunc); | |
exit(0); | |
} | |
int main(int argc, char *argv[]) { | |
const char sigErr[] = "I can't catch signal.\n"; | |
if (signal(SIGTRAP, sigHandler) == SIG_ERR) | |
fprintf(stderr, sigErr); | |
if (signal(SIGSEGV, sigHandler) == SIG_ERR) | |
fprintf(stderr, sigErr); | |
if (signal(SIGILL, sigHandler) == SIG_ERR) | |
fprintf(stderr, sigErr); | |
if (signal(SIGFPE, sigHandler) == SIG_ERR) | |
fprintf(stderr, sigErr); | |
if (signal(SIGKILL, sigHandler) == SIG_ERR) | |
fprintf(stderr, sigErr); | |
if (signal(SIGINT, sigHandler) == SIG_ERR) | |
fprintf(stderr, sigErr); | |
onStart(); | |
return 0; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <assert.h> | |
#include <inttypes.h> | |
#include <string.h> | |
#include <signal.h> | |
#include <unistd.h> | |
#include <CL/cl.h> | |
#define MAXGPU 8 | |
#define MAXN 2048 | |
uint32_t hostA[MAXN*MAXN], hostB[MAXN*MAXN], hostC[MAXN*MAXN]; | |
int N = MAXN; | |
char clSrcFormat[1024] = | |
"#define MAXR %d\n" | |
"#define MAXC %d\n" | |
"#define CTYPE unsigned int\n" | |
"\n" | |
"__kernel void matrixMul(__global CTYPE *in1, \n" | |
" __global CTYPE *in2, \n" | |
" __global CTYPE *out) { \n" | |
" CTYPE rbuf[MAXC]; \n" | |
" int r = get_global_id(0); \n" | |
" int localID = get_local_id(0); \n" | |
" int localSz = get_local_size(0); \n" | |
" __local CTYPE cbuf[MAXC]; \n" | |
" for (int i = 0; i < MAXC; i++) \n" | |
" rbuf[i] = in1[r * MAXR + i]; \n" | |
" for (int c = 0; c < MAXC; c++) { \n" | |
" for (int cr = localID; cr < MAXC; cr += localSz) \n" | |
" cbuf[cr] = in2[cr * MAXC + c]; \n" | |
" barrier(CLK_LOCAL_MEM_FENCE); \n" | |
" CTYPE sum = 0; \n" | |
" for (int k = 0; k < MAXC; k++) \n" | |
" sum += rbuf[k] * cbuf[k]; \n" | |
" out[r * MAXC + c] = sum; \n" | |
" } \n" | |
"}\n"; | |
char clSrc[1024] = ""; | |
char clSrcMain[1024] = "matrixMul"; | |
// -- start working with OpenCL | |
cl_context clCtx; | |
cl_program clPrg; | |
cl_kernel clKrn; | |
cl_command_queue clQue; | |
cl_mem clMemIn1, clMemIn2, clMemOut; | |
#define FailAndExit destroyGPU(clCtx, clPrg, clKrn, clQue, clMemIn1, clMemIn2, clMemOut) | |
#define clFuncArgs cl_context *clCtx, cl_program *clPrg, cl_kernel *clKrn, \ | |
cl_command_queue *clQue, cl_mem *clMemIn1, cl_mem *clMemIn2, \ | |
cl_mem *clMemOut | |
#define clCallFunc &clCtx, &clPrg, &clKrn, &clQue, &clMemIn1, &clMemIn2, &clMemOut | |
void destroyGPU(clFuncArgs) { | |
if (*clMemOut) clReleaseMemObject(*clMemOut); | |
if (*clMemIn2) clReleaseMemObject(*clMemIn2); | |
if (*clMemIn1) clReleaseMemObject(*clMemIn1); | |
if (*clKrn) clReleaseKernel(*clKrn); | |
if (*clPrg) clReleaseProgram(*clPrg); | |
if (*clQue) clReleaseCommandQueue(*clQue); | |
if (*clCtx) clReleaseContext(*clCtx); | |
exit(0); | |
} | |
int initAllGPU(clFuncArgs) { | |
sprintf(clSrc, clSrcFormat, N, N); | |
fprintf(stderr, "%s\n", clSrc); | |
exit(0); | |
cl_int clStat; | |
cl_uint clPlatN, clGPUN; | |
cl_platform_id clPlatID; | |
cl_device_id clGPUID[MAXGPU]; | |
const char *clSrcPtr = clSrc; | |
size_t clSrcLen = (size_t) strlen(clSrc); | |
// -- basic OpenCL setup | |
clGetPlatformIDs(1, &clPlatID, &clPlatN); | |
clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, MAXGPU, clGPUID, &clGPUN); | |
*clCtx = clCreateContext(NULL, clGPUN, clGPUID, NULL, NULL, &clStat); | |
*clQue = clCreateCommandQueue(*clCtx, clGPUID[0], 0, &clStat); | |
*clPrg = clCreateProgramWithSource(*clCtx, 1, &clSrcPtr, &clSrcLen, &clStat); | |
clStat = clBuildProgram(*clPrg, 1, clGPUID, NULL, NULL, NULL); | |
*clKrn = clCreateKernel(*clPrg, clSrcMain, &clStat); | |
// -- create all buffers | |
cl_mem_flags clInBuffFlag = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR; | |
cl_mem_flags clOutBuffFlag = CL_MEM_WRITE_ONLY; | |
*clMemIn1 = clCreateBuffer(*clCtx, clInBuffFlag, sizeof(uint32_t)*N*N, | |
hostA, &clStat); | |
*clMemIn2 = clCreateBuffer(*clCtx, clInBuffFlag, sizeof(uint32_t)*N*N, | |
hostB, &clStat); | |
*clMemOut = clCreateBuffer(*clCtx, clOutBuffFlag, sizeof(uint32_t)*N*N, | |
hostC, &clStat); | |
// -- set argument to kernel | |
clStat = clSetKernelArg(*clKrn, 0, sizeof(cl_mem), (void *) clMemIn1); | |
clStat = clSetKernelArg(*clKrn, 1, sizeof(cl_mem), (void *) clMemIn2); | |
clStat = clSetKernelArg(*clKrn, 2, sizeof(cl_mem), (void *) clMemOut); | |
return 1; | |
} | |
int executeGPU(clFuncArgs) { | |
cl_int clStat; | |
size_t globalOffset[] = {0, 0}; | |
size_t globalSize[] = {N}; | |
size_t localSize[] = {64}; | |
clStat = clEnqueueNDRangeKernel(*clQue, *clKrn, 1, globalOffset, | |
globalSize, localSize, 0, NULL, NULL); | |
clFinish(*clQue); | |
// -- read back | |
clEnqueueReadBuffer(*clQue, *clMemOut, CL_TRUE, 0, sizeof(uint32_t)*N*N, | |
hostC, 0, NULL, NULL); | |
return 1; | |
} | |
void readIn() { | |
uint32_t c1, c2; | |
scanf("%d %u %u", &N, &c1, &c2); | |
uint32_t x = 2, n = N*N; | |
x = 2; | |
for (int i = 0; i < N; i++) { | |
for (int j = 0; j < N; j++) { | |
x = (x * x + c1 + i + j)%n; | |
hostA[i*N+j] = x; | |
} | |
} | |
x = 2; | |
for (int i = 0; i < N; i++) { | |
for (int j = 0; j < N; j++) { | |
x = (x * x + c2 + i + j)%n; | |
hostB[i*N+j] = x; | |
} | |
} | |
} | |
void writeOut() { | |
uint32_t h = 0; | |
for (int i = 0; i < N; i++) { | |
for (int j = 0; j < N; j++) | |
h = (h + hostC[i*N+j]) * 2654435761LU; | |
} | |
printf("%u\n", h); | |
} | |
void onStart() { | |
readIn(); | |
initAllGPU(clCallFunc); | |
executeGPU(clCallFunc); | |
writeOut(); | |
destroyGPU(clCallFunc); | |
} | |
void sigHandler(int signo) { | |
printf("God Bless Me"); | |
destroyGPU(clCallFunc); | |
exit(0); | |
} | |
int main(int argc, char *argv[]) { | |
const char sigErr[] = "I can't catch signal.\n"; | |
if (signal(SIGTRAP, sigHandler) == SIG_ERR) | |
fprintf(stderr, sigErr); | |
if (signal(SIGSEGV, sigHandler) == SIG_ERR) | |
fprintf(stderr, sigErr); | |
if (signal(SIGILL, sigHandler) == SIG_ERR) | |
fprintf(stderr, sigErr); | |
if (signal(SIGFPE, sigHandler) == SIG_ERR) | |
fprintf(stderr, sigErr); | |
if (signal(SIGKILL, sigHandler) == SIG_ERR) | |
fprintf(stderr, sigErr); | |
if (signal(SIGINT, sigHandler) == SIG_ERR) | |
fprintf(stderr, sigErr); | |
onStart(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment