cuda memory example: a dot-product kernel timed two ways, once with cudaMalloc plus explicit cudaMemcpy and once with pinned, mapped (zero-copy) host memory from cudaHostAlloc. Apparently adapted from the dot-product examples in CUDA by Example (note the ../common/book.h include).
// CUDA-C includes
#include <cuda.h>
#include <stdio.h>
#include "../common/book.h"

extern "C"
int runCudaPart();

// CODE goes below
#define imin(a,b) (a<b?a:b)

const int N = 33 * 400 * 400;
const int threadsPerBlock = 128;
const int blocksPerGrid =
            imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
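// Capping the grid at 32 blocks keeps the partial-result array tiny; the
// kernel below strides by blockDim.x * gridDim.x, so 32 blocks still cover
// all N elements.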
__global__ void dot( int size, float *a, float *b, float *c ) {
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    float temp = 0;
    while (tid < size) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize threads in this block
    __syncthreads();

    // for reductions, threadsPerBlock must be a power of 2
    // because of the following code
    int i = blockDim.x/2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        __syncthreads();
        i /= 2;
    }

    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}
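// Sanity check (an addition, not in the original gist): with a[i] = i and
// b[i] = 2*i, the dot product has the closed form 2 * sum(i^2) over
// i in [0, size), and sum(i^2) up to x is x*(x+1)*(2x+1)/6.
#define sum_squares(x)  ((x) * ((x) + 1) * (2 * (x) + 1) / 6)
// Usage, e.g. after either test:
//   printf( "Expected: %f\n", 2 * sum_squares( (float)(N - 1) ) );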
float malloc_test( int size ) {
    cudaEvent_t start, stop;
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;
    float elapsedTime;

    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );

    // allocate memory on the CPU side
    a = (float*)malloc( size*sizeof(float) );
    b = (float*)malloc( size*sizeof(float) );
    partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                              size*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                              size*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
                              blocksPerGrid*sizeof(float) ) );

    // fill in the host memory with data
    for (int i=0; i<size; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    HANDLE_ERROR( cudaEventRecord( start, 0 ) );
    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR( cudaMemcpy( dev_a, a, size*sizeof(float),
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemcpy( dev_b, b, size*sizeof(float),
                              cudaMemcpyHostToDevice ) );

    dot<<<blocksPerGrid,threadsPerBlock>>>( size, dev_a, dev_b,
                                            dev_partial_c );

    // copy the array 'c' back from the GPU to the CPU
    HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
                              blocksPerGrid*sizeof(float),
                              cudaMemcpyDeviceToHost ) );

    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );

    // finish up on the CPU side
    c = 0;
    for (int i=0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_partial_c ) );

    // free memory on the CPU side
    free( a );
    free( b );
    free( partial_c );

    // free events
    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );

    printf( "Value calculated: %f\n", c );
    return elapsedTime;
}
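// Timing note: the measured interval above deliberately includes the two
// host-to-device copies and the copy of the partial sums back, so the
// comparison with the zero-copy version below charges transfer cost to
// the cudaMalloc path.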
float cuda_host_alloc_test( int size ) {
    cudaEvent_t start, stop;
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;
    float elapsedTime;

    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );

    // allocate pinned, mapped (zero-copy) memory on the CPU;
    // write-combined pages are fast for the GPU to read but slow for
    // the CPU to read back, so 'partial_c' is left cacheable
    HANDLE_ERROR( cudaHostAlloc( (void**)&a,
                                 size*sizeof(float),
                                 cudaHostAllocWriteCombined |
                                 cudaHostAllocMapped ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&b,
                                 size*sizeof(float),
                                 cudaHostAllocWriteCombined |
                                 cudaHostAllocMapped ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&partial_c,
                                 blocksPerGrid*sizeof(float),
                                 cudaHostAllocMapped ) );

    // find out the GPU-side aliases for the mapped host pointers
    HANDLE_ERROR( cudaHostGetDevicePointer( &dev_a, a, 0 ) );
    HANDLE_ERROR( cudaHostGetDevicePointer( &dev_b, b, 0 ) );
    HANDLE_ERROR( cudaHostGetDevicePointer( &dev_partial_c,
                                            partial_c, 0 ) );

    // fill in the host memory with data
    for (int i=0; i<size; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    HANDLE_ERROR( cudaEventRecord( start, 0 ) );

    dot<<<blocksPerGrid,threadsPerBlock>>>( size, dev_a, dev_b,
                                            dev_partial_c );

    // the original called cudaThreadSynchronize(), which is deprecated;
    // cudaDeviceSynchronize() is its direct replacement
    HANDLE_ERROR( cudaDeviceSynchronize() );
    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );

    // finish up on the CPU side
    c = 0;
    for (int i=0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    HANDLE_ERROR( cudaFreeHost( a ) );
    HANDLE_ERROR( cudaFreeHost( b ) );
    HANDLE_ERROR( cudaFreeHost( partial_c ) );

    // free events
    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );

    printf( "Value calculated: %f\n", c );
    return elapsedTime;
}
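// Note on the comparison: cuda_host_alloc_test never calls cudaMemcpy. The
// kernel dereferences mapped host memory directly across the bus, which
// tends to win here because every input element is read exactly once; data
// that is reused on the GPU would favor the explicit-copy version instead.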
// Main cuda function
int runCudaPart() {
    printf("Hello from cuda \n");

    cudaDeviceProp prop;
    int whichDevice;
    HANDLE_ERROR( cudaGetDevice( &whichDevice ) );
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, whichDevice ) );
    if (prop.canMapHostMemory != 1) {
        printf( "Device cannot map host memory.\n" );
        return 0;
    }

    float elapsedTime;
    // must be set before any mapped (zero-copy) allocations are made
    HANDLE_ERROR( cudaSetDeviceFlags( cudaDeviceMapHost ) );

    // try it with malloc
    elapsedTime = malloc_test( N );
    printf( "Time using cudaMalloc: %3.1f ms\n",
            elapsedTime );

    // now try it with cudaHostAlloc
    elapsedTime = cuda_host_alloc_test( N );
    printf( "Time using cudaHostAlloc: %3.1f ms\n",
            elapsedTime );
    return 0;
}
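The file exposes runCudaPart() with C linkage but contains no main, and it relies on CUDA by Example's book.h for HANDLE_ERROR. A minimal hypothetical driver (file names and build line are assumptions, not from the gist) could be:

// main.cu -- hypothetical driver, not part of the original gist
extern "C" int runCudaPart();

int main( void ) {
    return runCudaPart();
}

// build sketch (assumes book.h sits one directory up, as the include implies):
//   nvcc -o cuda_memory_example cuda_memory_example.cu main.cu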