Last active
May 5, 2020 10:51
-
-
Save oak-tree/f270a6cc5666de61ad81d84c3253d6eb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <assert.h> | |
/*
 * Reports a failed CUDA runtime call.
 *
 * If `result` is not cudaSuccess, the matching error string is written to
 * stderr and an assert fires (the assert is compiled out when NDEBUG is
 * defined). `result` is always handed back unchanged so the call can be
 * wrapped inline.
 */
inline cudaError_t checkCuda(cudaError_t result)
{
    // Nothing to report on success.
    if (result == cudaSuccess)
        return result;

    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
    return result;
}
/*
 * Fetches the most recent CUDA error via cudaGetLastError() and reports it.
 *
 * On anything other than cudaSuccess the error string goes to stderr and an
 * assert fires (compiled out under NDEBUG). The fetched status is returned
 * in every case.
 */
inline cudaError_t checkLastCuda()
{
    const cudaError_t result = cudaGetLastError();

    // Nothing to report on success.
    if (result == cudaSuccess)
        return result;

    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
    return result;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <assert.h> | |
/*
 * Reports a failed CUDA runtime call.
 *
 * If `result` is not cudaSuccess, the matching error string is written to
 * stderr and an assert fires (the assert is compiled out when NDEBUG is
 * defined). `result` is always handed back unchanged so the call can be
 * wrapped inline.
 */
inline cudaError_t checkCuda(cudaError_t result)
{
    // Nothing to report on success.
    if (result == cudaSuccess)
        return result;

    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
    return result;
}
/*
 * Fetches the most recent CUDA error via cudaGetLastError() and reports it.
 *
 * On anything other than cudaSuccess the error string goes to stderr and an
 * assert fires (compiled out under NDEBUG). The fetched status is returned
 * in every case.
 */
inline cudaError_t checkLastCuda()
{
    const cudaError_t result = cudaGetLastError();

    // Nothing to report on success.
    if (result == cudaSuccess)
        return result;

    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
    return result;
}
/*
 * Host-side fill: stores `num` into each of the first N elements of `a`.
 * Does nothing when N is zero or negative.
 */
void initWith(float num, float *a, int N)
{
    int idx = 0;
    while (idx < N)
    {
        a[idx] = num;
        ++idx;
    }
}
/*
 * Element-wise vector addition kernel: result[i] = a[i] + b[i] for i in [0, N).
 *
 * Uses a grid-stride loop over the x dimension: each thread starts at its
 * global x index and advances by the total number of threads in the grid,
 * so any 1D launch configuration covers all N elements.
 */
__global__ void addVectorsInto(float *result, float *a, float *b, int N)
{
    // Total number of threads launched along x.
    const int stride = gridDim.x * blockDim.x;

    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    while (idx < N)
    {
        result[idx] = a[idx] + b[idx];
        idx += stride;
    }
}
/*
 * Verifies that the first N elements of `array` all compare equal to `target`.
 *
 * On the first mismatch a FAIL line naming the index and both values is
 * printed and the process exits with status 1. If every element matches
 * (including the N <= 0 case, where nothing is inspected) a success line
 * is printed.
 */
void checkElementsAre(float target, float *array, int N)
{
    int idx = 0;
    while (idx < N)
    {
        const float value = array[idx];
        if (value != target)
        {
            printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", idx, value, target);
            exit(1);
        }
        ++idx;
    }
    printf("SUCCESS! All values added correctly.\n");
}
/*
 * Adds two vectors of N floats on the GPU using managed memory and verifies
 * on the host that every element of the result equals 7.
 *
 * Returns 0 on success and 1 if any allocation fails; a wrong result makes
 * checkElementsAre() exit the process with status 1.
 */
int main()
{
    const int N = 2<<20;                 // 2 * 2^20 = 2,097,152 elements
    size_t size = N * sizeof(float);

    // Start from NULL so the failure path can free unconditionally
    // (cudaFree on a null pointer is a no-op).
    float *a = NULL;
    float *b = NULL;
    float *c = NULL;

    // Check every allocation: a failed cudaMallocManaged would otherwise
    // leave a garbage pointer for the host-side initialisation below.
    if (checkCuda(cudaMallocManaged(&a, size)) != cudaSuccess ||
        checkCuda(cudaMallocManaged(&b, size)) != cudaSuccess ||
        checkCuda(cudaMallocManaged(&c, size)) != cudaSuccess)
    {
        cudaFree(a);
        cudaFree(b);
        cudaFree(c);
        return 1;
    }

    initWith(3, a, N);
    initWith(4, b, N);
    initWith(0, c, N);

    // Use full warps per block and enough blocks to cover N. The kernel is
    // a grid-stride loop, so the result does not depend on this choice;
    // one thread per block would leave most of each warp idle.
    const int threads = 256;
    const int blocks = (N + threads - 1) / threads;

    addVectorsInto<<<blocks, threads>>>(c, a, b, N);
    checkLastCuda();                      // launch-configuration errors
    checkCuda(cudaDeviceSynchronize());   // errors raised while the kernel ran

    checkElementsAre(7, c, N);

    checkCuda(cudaFree(a));
    checkCuda(cudaFree(b));
    checkCuda(cudaFree(c));

    return 0;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <assert.h> | |
/*
 * Reports a failed CUDA runtime call.
 *
 * If `result` is not cudaSuccess, the matching error string is written to
 * stderr and an assert fires (the assert is compiled out when NDEBUG is
 * defined). `result` is always handed back unchanged so the call can be
 * wrapped inline.
 */
inline cudaError_t checkCuda(cudaError_t result)
{
    // Nothing to report on success.
    if (result == cudaSuccess)
        return result;

    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
    return result;
}
/*
 * Fetches the most recent CUDA error via cudaGetLastError() and reports it.
 *
 * On anything other than cudaSuccess the error string goes to stderr and an
 * assert fires (compiled out under NDEBUG). The fetched status is returned
 * in every case.
 */
inline cudaError_t checkLastCuda()
{
    const cudaError_t result = cudaGetLastError();

    // Nothing to report on success.
    if (result == cudaSuccess)
        return result;

    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
    return result;
}
#define N 64

/*
 * Integer matrix multiply kernel: c = a * b for N x N matrices stored
 * row-major in flat arrays (element [row][col] lives at row * N + col).
 *
 * Launch layout: a 2D grid of 2D blocks that covers at least N threads in
 * both x and y. Each thread computes one element of c; threads that fall
 * outside the N x N range return without touching memory.
 *
 * The x dimension maps to the column and y to the row, so that threads
 * adjacent in x write adjacent elements of c and read adjacent elements of
 * b (coalesced accesses), instead of striding through memory by N.
 */
__global__ void matrixMulGPU( int * a, int * b, int * c)
{
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard against grids that over-cover the matrix.
    if (row >= N || col >= N)
        return;

    // Dot product of row `row` of a with column `col` of b.
    int val = 0;
    for ( int k = 0; k < N; ++k )
        val += a[row * N + k] * b[k * N + col];

    c[row * N + col] = val;
}
/*
 * Host reference implementation of the N x N integer matrix product
 * c = a * b, with matrices stored row-major in flat arrays. Its output is
 * the solution matrix against which the matrixMulGPU kernel is verified.
 */
void matrixMulCPU( int * a, int * b, int * c )
{
    for ( int r = 0; r < N; ++r )
    {
        for ( int col_idx = 0; col_idx < N; ++col_idx )
        {
            // Dot product of row r of a with column col_idx of b.
            int sum = 0;
            for ( int inner = 0; inner < N; ++inner )
            {
                sum += a[r * N + inner] * b[inner * N + col_idx];
            }
            c[r * N + col_idx] = sum;
        }
    }
}
/*
 * Multiplies two N x N integer matrices on the GPU and on the CPU, then
 * compares the two results element by element.
 *
 * Returns 0 when the results match, 1 when an allocation fails or a
 * mismatch is found.
 */
int main()
{
    // Start from NULL so the failure path can free unconditionally
    // (cudaFree on a null pointer is a no-op).
    int *a = NULL, *b = NULL, *c_cpu = NULL, *c_gpu = NULL; // c_cpu / c_gpu: solution matrices for the CPU and GPU runs
    size_t size = (size_t)N * N * sizeof (int); // Number of bytes of an N x N matrix

    // Allocate memory, checking every call so a failure is not followed by
    // host writes through an invalid pointer.
    if (checkCuda(cudaMallocManaged (&a, size)) != cudaSuccess ||
        checkCuda(cudaMallocManaged (&b, size)) != cudaSuccess ||
        checkCuda(cudaMallocManaged (&c_cpu, size)) != cudaSuccess ||
        checkCuda(cudaMallocManaged (&c_gpu, size)) != cudaSuccess)
    {
        cudaFree(a); cudaFree(b);
        cudaFree( c_cpu ); cudaFree( c_gpu );
        return 1;
    }

    // Initialize memory; create 2D matrices
    for( int row = 0; row < N; ++row )
        for( int col = 0; col < N; ++col )
        {
            a[row*N + col] = row;
            b[row*N + col] = col+2;
            c_cpu[row*N + col] = 0;
            c_gpu[row*N + col] = 0;
        }

    // 16 x 16 threads per block; round the block count up so the grid still
    // covers the whole matrix if N is not a multiple of the block size (the
    // kernel bounds-checks the excess threads).
    dim3 threads_per_block(16,16,1);
    dim3 number_of_blocks((N + threads_per_block.x - 1) / threads_per_block.x,
                          (N + threads_per_block.y - 1) / threads_per_block.y,
                          1);

    matrixMulGPU <<< number_of_blocks, threads_per_block >>> ( a, b, c_gpu );
    checkLastCuda();                      // launch-configuration errors
    checkCuda(cudaDeviceSynchronize());   // errors raised while the kernel ran

    // Call the CPU version to check our work
    matrixMulCPU( a, b, c_cpu );

    // Compare the two answers to make sure they are equal; stop at the
    // first mismatch.
    bool error = false;
    for( int row = 0; row < N && !error; ++row )
        for( int col = 0; col < N && !error; ++col )
            if (c_cpu[row * N + col] != c_gpu[row * N + col])
            {
                printf("FOUND ERROR at c[%d][%d]\n", row, col);
                error = true;
            }
    if (!error)
        printf("Success!\n");

    // Free all our allocated memory
    checkCuda(cudaFree(a)); checkCuda(cudaFree(b));
    checkCuda(cudaFree( c_cpu )); checkCuda(cudaFree( c_gpu ));

    // Report a mismatch through the exit status as well as stdout.
    return error ? 1 : 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment