Last active
March 31, 2018 00:29
-
-
Save andreinechaev/2f69736e3adea40c4a52ab9de3cd6996 to your computer and use it in GitHub Desktop.
A solution to Nvidia Cuda Course.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h>
#define N 64
/*
 * Report a CUDA runtime error, if any, tagged with a caller-supplied
 * context label. The error code is returned unchanged so the helper
 * can wrap a CUDA API call inline without altering its result.
 */
inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
    if (err == cudaSuccess) {
        return err;
    }
    fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, cudaGetErrorString(err));
    return err;
}
/*
 * Naive dense matrix multiply on the GPU: c = a * b for N x N row-major
 * int matrices. Expects a 2D grid of 2D blocks; each thread produces one
 * output element. The bounds guard lets the grid safely over-cover the
 * matrix when N is not a multiple of the block dimensions.
 */
__global__ void matrixMulGPU( int * a, int * b, int * c )
{
    int outRow = blockDim.y * blockIdx.y + threadIdx.y;
    int outCol = blockDim.x * blockIdx.x + threadIdx.x;

    // Threads that land outside the matrix contribute nothing.
    if (outRow >= N || outCol >= N)
        return;

    int acc = 0;
    for (int k = 0; k < N; ++k)
        acc += a[outRow * N + k] * b[k * N + outCol];

    c[outRow * N + outCol] = acc;
}
/*
 * CPU reference implementation of c = a * b for N x N row-major int
 * matrices, used to validate the output of the matrixMulGPU kernel.
 */
void matrixMulCPU( int * a, int * b, int * c )
{
    for (int r = 0; r < N; ++r) {
        for (int col = 0; col < N; ++col) {
            int acc = 0;  // fresh accumulator per output element
            for (int k = 0; k < N; ++k) {
                acc += a[r * N + k] * b[k * N + col];
            }
            c[r * N + col] = acc;
        }
    }
}
/*
 * Driver: multiplies two N x N int matrices on the GPU, recomputes the
 * product on the CPU, and compares the two results element by element.
 */
int main()
{
    int *a, *b, *c_cpu, *c_gpu;          // inputs plus one result matrix per implementation
    size_t size = N * N * sizeof (int);  // number of bytes in an N x N int matrix

    // Allocate unified memory reachable from both host and device,
    // checking each allocation with the file's error helper.
    checkCudaErr(cudaMallocManaged (&a, size), "a allocation");
    checkCudaErr(cudaMallocManaged (&b, size), "b allocation");
    checkCudaErr(cudaMallocManaged (&c_cpu, size), "c_cpu allocation");
    checkCudaErr(cudaMallocManaged (&c_gpu, size), "c_gpu allocation");

    // Initialize memory; create 2D matrices
    for( int row = 0; row < N; ++row )
        for( int col = 0; col < N; ++col )
        {
            a[row*N + col] = row;
            b[row*N + col] = col+2;
            c_cpu[row*N + col] = 0;
            c_gpu[row*N + col] = 0;
        }

    // 2D launch configuration: ceiling-divide so the grid covers the whole
    // matrix without allocating a needless extra block row/column when N
    // happens to be a multiple of the block dimensions.
    dim3 threads_per_block(32, 32, 1);
    dim3 number_of_blocks((N + threads_per_block.x - 1) / threads_per_block.x,
                          (N + threads_per_block.y - 1) / threads_per_block.y, 1);

    matrixMulGPU <<< number_of_blocks, threads_per_block >>> ( a, b, c_gpu );

    // Check for launch-configuration errors immediately after the launch,
    // then synchronize to surface any asynchronous execution errors.
    checkCudaErr(cudaGetLastError(), "GPU launch");
    checkCudaErr(cudaDeviceSynchronize(), "Synchronization");

    // Call the CPU version to check our work
    matrixMulCPU( a, b, c_cpu );

    // Compare the two answers to make sure they are equal
    bool error = false;
    for( int row = 0; row < N && !error; ++row )
        for( int col = 0; col < N && !error; ++col )
            if (c_cpu[row * N + col] != c_gpu[row * N + col])
            {
                printf("FOUND ERROR at c[%d][%d]\n", row, col);
                error = true;
                break;
            }
    if (!error)
        printf("Success!\n");

    // Free all our allocated memory
    cudaFree(a); cudaFree(b);
    cudaFree( c_cpu ); cudaFree( c_gpu );
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment