Created April 1, 2014 18:17
Graphics Processing Unit (GPU) programming: matrix multiplication implemented in parallel with CUDA
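A typical way to build and run the listing below (assuming it is saved as matmul.cu and the CUDA toolkit's nvcc compiler is on the PATH):

nvcc matmul.cu -o matmul
./matmul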
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cuda.h>

// Thread block size: the kernel is launched with BLOCK_SIZE x BLOCK_SIZE
// threads per block, so this can be raised to 16 or 32
// (32 x 32 = 1024 is the per-block thread limit on current GPUs).
#define BLOCK_SIZE 8

// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity;
// each can be changed to 16, 32, 64, 128, 256, 512 or 1024)
#define WA 8      // Matrix A width
#define HA 8      // Matrix A height
#define WB 8      // Matrix B width
#define HB WA     // Matrix B height
#define WC WB     // Matrix C width
#define HC HA     // Matrix C height

// Fills a matrix with random float entries in [0, 1].
void randomInit(float* data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = rand() / (float)RAND_MAX;
}
// Computes one element of Pd = Md * Nd per thread (square Width x Width matrices).
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width)
{
    // Calculate the row index of the Pd element and of Md
    int Row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    // Calculate the column index of Pd and of Nd
    int Col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    float Pvalue = 0;
    // Each thread computes one element of the block sub-matrix
    for (int k = 0; k < Width; ++k)
        Pvalue += Md[Row * Width + k] * Nd[k * Width + Col];
    Pd[Row * Width + Col] = Pvalue;
}
// Straightforward CPU reference: triple loop over square Width x Width matrices.
void MatrixMulOnHost(float* M, float* N, float* P, int Width)
{
    for (int i = 0; i < Width; ++i)
        for (int j = 0; j < Width; ++j) {
            double sum = 0;
            for (int k = 0; k < Width; ++k) {
                double a = M[i * Width + k];
                double b = N[k * Width + j];
                sum += a * b;
            }
            P[i * Width + j] = (float)sum;
        }
}
int main()
{
    //cudaSetDevice( cutGetMaxGflopsDeviceId() );
    srand(2006);
    double ctime, cudatime, htime;
    clock_t stime, stime2, etime, etime2;
    // allocate host memory for matrices A and B
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A = (float*) malloc(mem_size_A);
    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B = (float*) malloc(mem_size_B);
    // initialize host memory
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);
    stime = clock();
    // allocate device memory
    float* d_A;
    cudaMalloc((void**) &d_A, mem_size_A);
    float* d_B;
    cudaMalloc((void**) &d_B, mem_size_B);
    // copy host memory to device
    cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
    // allocate device memory for result
    unsigned int size_C = WC * HC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* d_C;
    cudaMalloc((void**) &d_C, mem_size_C);
    stime2 = clock();
    // setup execution parameters:
    // one BLOCK_SIZE x BLOCK_SIZE thread block per tile of C, with enough
    // blocks in the grid to cover the whole WC x HC result matrix
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(WC / BLOCK_SIZE, HC / BLOCK_SIZE);
    // execute the kernel
    MatrixMulKernel<<< grid, threads >>>(d_A, d_B, d_C, WB);
    cudaDeviceSynchronize();
    etime2 = clock();
    cudatime = (double)(etime2 - stime2) / CLOCKS_PER_SEC;
    // allocate host memory for the result
    float* h_C = (float*) malloc(mem_size_C);
    // copy result from device to host
    cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
    etime = clock();
    ctime = (double)(etime - stime) / CLOCKS_PER_SEC;
    // compute reference solution
    float* reference = (float*) malloc(mem_size_C);
    stime = clock();
    MatrixMulOnHost(h_A, h_B, reference, WB);
    etime = clock();
    htime = (double)(etime - stime) / CLOCKS_PER_SEC;
    printf("host: %.10f\ncuda: %.10f\ncuda, w/copy: %.10f\n", htime, cudatime, ctime);
    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    free(reference);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaDeviceReset();
    return 0;
}
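The program times the host and device multiplications but never checks that they agree. A minimal verification sketch (not part of the original gist) that could be inserted just before the clean-up section of main, reusing the existing h_C, reference, and size_C variables and a small tolerance to absorb float-vs-double rounding differences:

    // Hypothetical check: compare the GPU result against the CPU reference.
    int mismatches = 0;
    for (unsigned int i = 0; i < size_C; ++i)
        if (fabs(h_C[i] - reference[i]) > 1e-4)
            ++mismatches;
    printf("%s (%d mismatching elements)\n", mismatches ? "FAILED" : "PASSED", mismatches);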