Created
April 1, 2014 18:19
-
-
Save jatesy/9919894 to your computer and use it in GitHub Desktop.
Graphics Processing Unit (GPU) programming: vector addition implemented in parallel in CUDA
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cuda.h>
// block size
#define BLOCK_SIZE 512
#define VECTOR_SIZE 100 // We can change the value of VECTOR_SIZE to 200, 400, 800, 1600, 3200
// Fill `data` with `size` pseudo-random floats drawn uniformly from [0, 1].
// Uses the C library rand(); seed with srand() beforehand for reproducibility.
void randomInit(float* data, int size)
{
    for (int idx = 0; idx < size; ++idx) {
        data[idx] = (float)rand() / (float)RAND_MAX;
    }
}
// Element-wise vector addition: Pd[i] = Md[i] + Nd[i] for i in [0, VECTOR_SIZE).
// Expected launch: 1-D grid of 1-D blocks, one thread per element.
// Md, Nd, Pd must be device pointers to at least VECTOR_SIZE floats.
__global__ void VectorAdd(float* Md, float* Nd, float* Pd)
{
    // Flat global index of the element this thread handles.
    // Use blockDim.x rather than the BLOCK_SIZE macro so the index is
    // correct for whatever block size the launch actually used.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    // Bounds guard: VECTOR_SIZE (100) is smaller than BLOCK_SIZE (512),
    // so excess threads in the tail block must do nothing.
    if (index < VECTOR_SIZE)
        Pd[index] = Md[index] + Nd[index]; // was '*': an *addition* kernel must sum
}
// Abort with a descriptive message if a CUDA runtime call failed.
static void checkCuda(cudaError_t err, const char* what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host driver: builds two random vectors, adds them on the GPU, verifies
// the result against a CPU reference, and reports wall-clock time.
int main()
{
    //cudaSetDevice( cutGetMaxGflopsDeviceId() );
    srand(2006);
    clock_t stime = clock();

    const unsigned int n = VECTOR_SIZE;
    // Round allocations up to a whole number of blocks so every launched
    // thread can touch its slot without running past the buffer, even if
    // the kernel lacks a tail guard.
    const unsigned int padded = ((n + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE;
    const size_t memSize = sizeof(float) * padded;

    // Host buffers for the two inputs, the GPU result, and the CPU reference.
    float* h_A = (float*) malloc(memSize);
    float* h_B = (float*) malloc(memSize);
    float* h_C = (float*) malloc(memSize);
    float* reference = (float*) malloc(memSize);
    if (!h_A || !h_B || !h_C || !reference) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    randomInit(h_A, (int)padded);
    randomInit(h_B, (int)padded);

    // Device buffers; every CUDA call's status is checked.
    float* d_A = NULL;
    float* d_B = NULL;
    float* d_C = NULL;
    checkCuda(cudaMalloc((void**) &d_A, memSize), "cudaMalloc d_A");
    checkCuda(cudaMalloc((void**) &d_B, memSize), "cudaMalloc d_B");
    checkCuda(cudaMalloc((void**) &d_C, memSize), "cudaMalloc d_C");

    checkCuda(cudaMemcpy(d_A, h_A, memSize, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, memSize, cudaMemcpyHostToDevice), "copy B to device");

    // One thread per element, 1-D launch.  The original code used
    // dim3 threads(1, BLOCK_SIZE) — so threadIdx.x was always 0 — and
    // grid(VECTOR_SIZE/threads.x, VECTOR_SIZE/threads.y), whose y extent
    // is 100/512 == 0: an invalid, empty launch.
    const unsigned int blocks = padded / BLOCK_SIZE; // == ceil(n / BLOCK_SIZE)
    VectorAdd<<< blocks, BLOCK_SIZE >>>(d_A, d_B, d_C);
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    checkCuda(cudaMemcpy(h_C, d_C, memSize, cudaMemcpyDeviceToHost), "copy C to host");

    // CPU reference solution and tolerance-based comparison.  The original
    // allocated `reference` but never filled or checked it.
    int mismatches = 0;
    for (unsigned int i = 0; i < n; ++i) {
        reference[i] = h_A[i] + h_B[i];
        if (fabsf(h_C[i] - reference[i]) > 1e-5f)
            ++mismatches;
    }
    printf("%s (%d mismatches out of %u elements)\n",
           mismatches == 0 ? "PASSED" : "FAILED", mismatches, n);

    // Clean up host and device memory.
    free(h_A);
    free(h_B);
    free(h_C);
    free(reference);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    // cudaThreadExit() is deprecated; cudaDeviceReset() is its replacement.
    cudaDeviceReset();

    // Record elapsed wall-clock time for the whole run.
    clock_t etime = clock();
    printf("CUDA time: %.10f\n", (double)(etime - stime) / CLOCKS_PER_SEC);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment