Skip to content

Instantly share code, notes, and snippets.

@sandeepkumar-skb
Last active January 30, 2021 04:34
Show Gist options
  • Save sandeepkumar-skb/341cd017139f9a47ad68ab71013d0b54 to your computer and use it in GitHub Desktop.
Save sandeepkumar-skb/341cd017139f9a47ad68ab71013d0b54 to your computer and use it in GitHub Desktop.
#include <cmath>
#include <iostream>
#include <stdio.h>
#define BLOCK_SIZE 16
// Abort the process with file/line context when a CUDA API call fails.
// Intended to be invoked through the gpuErrchk() macro, which supplies
// __FILE__ and __LINE__ at the call site.
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
    if (err == cudaSuccess)
        return;
    printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
    exit(EXIT_FAILURE);
}
// Wrap every CUDA runtime call: gpuErrchk(cudaMalloc(...));
// do { } while (0) makes the macro a single statement, so it composes
// safely with if/else without braces (a bare { } block would not).
#define gpuErrchk(ans)                        \
    do {                                      \
        gpuAssert((ans), __FILE__, __LINE__); \
    } while (0)
// Naive GPU matrix multiply: C (MxN) = A (MxK) * B (KxN), row-major.
// Launch with a 2D grid covering N columns (x) by M rows (y); each thread
// computes one output element. Accumulates in float regardless of T.
template<typename T>
__global__
void matmul_d(const T* __restrict__ A,
              const T* __restrict__ B,
              T* C,
              int M,
              int N,
              int K){
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    // Bounds guard: grid dimensions are rounded up, so edge threads may
    // fall outside the output matrix.
    if(row < M && col < N){
        float t_val = 0.0f;
        for (int k=0; k<K; ++k){
            // BUG FIX: B is KxN, so its row stride is N, not K.
            // The original B[k*K + col] read the wrong elements; the error
            // was masked because the test fills B with a uniform constant.
            t_val += A[row*K + k] * B[k*N + col];
        }
        C[row*N + col] = static_cast<T>(t_val);
    }
}
// CPU reference matrix multiply: C (MxN) = A (MxK) * B (KxN), row-major.
// Mirrors matmul_d exactly (float accumulator, same index math) so the
// GPU result can be compared element-wise against it.
template<typename T>
void matmul_golden(const T* A,
                   const T* B,
                   T* C,
                   int M,
                   int N,
                   int K){
    for(int row=0; row<M; ++row){
        for (int col=0; col<N; ++col){
            float t_val = 0.0f;
            for (int k=0; k<K; ++k){
                // BUG FIX: B is KxN (row stride N); the original indexed
                // B[k*K + col], which is wrong whenever K != N. It went
                // unnoticed because B was filled with a uniform constant.
                t_val += A[row*K + k] * B[k*N + col];
            }
            C[row*N + col] = static_cast<T>(t_val);
        }
    }
}
// MxK, KxN matrix
// Multiplies A (MxK, row-major) by B (KxN) on the GPU, validates against a
// CPU reference, and reports kernel time plus effective GFLOP/s.
int main(){
    float *A, *B, *C, *gC;
    int M = 4096;
    int N = 2048;
    int K = 1024;

    // Managed (unified) memory: one pointer valid on both host and device.
    gpuErrchk(cudaMallocManaged(&A, M*K*sizeof(float)));
    gpuErrchk(cudaMallocManaged(&B, K*N*sizeof(float)));
    gpuErrchk(cudaMallocManaged(&C, M*N*sizeof(float)));
    gpuErrchk(cudaMallocManaged(&gC, M*N*sizeof(float)));

    for (int i=0; i < M*K; ++i)
        A[i] = i;
    for (int i=0; i < K*N; ++i)
        B[i] = 2;
    for (int i=0; i < M*N; ++i)
    {
        C[i] = 0;
        gC[i] = 0;
    }

    // CPU reference result for correctness checking.
    matmul_golden<float>(A, B, gC, M, N, K);

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    // One thread per output element; round grid up so M, N need not be
    // multiples of BLOCK_SIZE (the kernel bounds-checks the tail).
    dim3 num_threads(BLOCK_SIZE, BLOCK_SIZE, 1);
    dim3 num_blocks((N-1)/BLOCK_SIZE + 1, (M-1)/BLOCK_SIZE + 1, 1);

    gpuErrchk(cudaEventRecord(start));
    matmul_d<float><<<num_blocks, num_threads>>>(A, B, C, M, N, K);
    // Kernel launches don't return errors directly; check explicitly so a
    // bad launch configuration doesn't fail silently.
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaEventRecord(stop));
    // Synchronizing on 'stop' also guarantees the kernel has finished
    // before the host reads C below.
    gpuErrchk(cudaEventSynchronize(stop));

    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));

    // fabsf, not abs: in C++ the global ::abs is the int overload, which
    // would silently truncate the float difference.
    float maxError = 0.0f;
    for(int i=0; i < M*N; ++i){
        float diff = fabsf(C[i] - gC[i]);
        if (maxError < diff){
            maxError = diff;
        }
    }

    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    gpuErrchk(cudaFree(A));
    gpuErrchk(cudaFree(B));
    gpuErrchk(cudaFree(C));
    gpuErrchk(cudaFree(gC));

    printf("Max error: %f\n", maxError);
    printf("Effective time: %.3f ms\n", milliseconds);
    // 2*M*N*K floating-point ops (one multiply + one add per inner step);
    // ms -> GFLOP/s conversion: flops / (ms * 1e6).
    float flops = 2 * (float)M * (float)K * (float)N;
    printf("Effective Flops: %.3f GFlops\n", flops/milliseconds/1e+6f);
}
@sandeepkumar-skb
Copy link
Author

sandeepkumar-skb commented Dec 14, 2020

Compile: nvcc matmul.cu -o matmul && ./matmul
Results:

Max error: 0.000000
Effective time: 9.809 ms
Effective Flops: 218.920 GFlops

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment