Skip to content

Instantly share code, notes, and snippets.

@sandeepkumar-skb
Last active January 25, 2021 04:32
Show Gist options
  • Save sandeepkumar-skb/918407d7f115e545d129bd362f40b69d to your computer and use it in GitHub Desktop.
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <stdio.h>
#define TILE_WIDTH 32
__global__
// Tiled matrix multiply: C (MxN) = A (MxK) * B (KxN), row-major.
// Expects blockDim == (TILE_WIDTH, TILE_WIDTH); grid sized to cover
// ceil(N/TILE_WIDTH) x ceil(M/TILE_WIDTH) blocks. Uses two TILE_WIDTH^2
// shared-memory tiles; out-of-range tile elements are zero-filled so
// M, N, K need not be multiples of TILE_WIDTH.
void matmul_d(float* A, float* B, float* C, int M, int N, int K){
__shared__ float shmem_A[TILE_WIDTH][TILE_WIDTH] ;
__shared__ float shmem_B[TILE_WIDTH][TILE_WIDTH] ;
int row = blockIdx.y*blockDim.y + threadIdx.y;
int col = blockIdx.x*blockDim.x + threadIdx.x;
int tx = threadIdx.x; int ty = threadIdx.y;
float temp = 0.0f;
// Ceil-divide so a ragged tail of K is still processed; the original
// floor division (K/TILE_WIDTH) dropped the last partial tile whenever
// K was not a multiple of TILE_WIDTH, despite the guards below.
int num_phases = (K + TILE_WIDTH - 1) / TILE_WIDTH;
for (int phase=0; phase < num_phases; ++phase){
// Stage one tile of A; pad out-of-bounds entries with zero.
if (row < M && phase*TILE_WIDTH+tx < K)
shmem_A[ty][tx] = A[row*K + phase*TILE_WIDTH + tx];
else
shmem_A[ty][tx] = 0.0f;
// Stage one tile of B, likewise zero-padded.
if (col < N && (ty + phase*TILE_WIDTH) < K)
shmem_B[ty][tx] = B[(ty + phase*TILE_WIDTH)*N + col];
else
shmem_B[ty][tx] = 0.0f;
__syncthreads();  // all loads visible before any thread reads the tiles
for (int i=0; i < TILE_WIDTH; ++i){
temp += shmem_A[ty][i]*shmem_B[i][tx];
}
__syncthreads();  // finish reads before the next phase overwrites tiles
}
if (row < M && col < N)
C[row*N + col] = temp;
}
template<typename T>
// CPU reference: C (MxN) = A (MxK) * B (KxN), row-major, accumulating
// in float. Used to validate the GPU kernel's output.
void matmul_golden(const T* A,
const T* B,
T* C,
int M,
int N,
int K){
for(int row=0; row<M; ++row){
for (int col=0; col<N; ++col){
float t_val = 0.0f;
for (int k=0; k<K; ++k){
// B's row stride is N (B is KxN); the original indexed B[k*K + col],
// which is correct only when K == N and otherwise reads wrong elements.
t_val += A[row*K + k] * B[k*N + col];
}
C[row*N + col] = static_cast<T>(t_val);
}
}
}
// MxK KxN -> MxN
// Abort with a diagnostic on any failed CUDA runtime call; kernel launch
// errors are caught via cudaGetLastError() after the launch.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)
// Runs the tiled GPU matmul, validates it against the CPU reference,
// and reports max absolute error, elapsed time, and effective GFLOPs.
int main(){
float *A, *B, *C, *gC;
int M = 4096;
int N = 2045;
int K = 1024;
CUDA_CHECK(cudaMallocManaged(&A, M*K*sizeof(float)));
CUDA_CHECK(cudaMallocManaged(&B, K*N*sizeof(float)));
CUDA_CHECK(cudaMallocManaged(&C, M*N*sizeof(float)));
CUDA_CHECK(cudaMallocManaged(&gC, M*N*sizeof(float)));
for (int i=0; i < M*K; ++i)
A[i] = i;
for (int i=0; i < K*N; ++i)
B[i] = 2;
for (int i=0; i < M*N; ++i)
{
C[i] = 0;
gC[i] = 0;
}
matmul_golden(A, B, gC, M, N, K);
cudaEvent_t start, stop;
CUDA_CHECK(cudaEventCreate(&start));
CUDA_CHECK(cudaEventCreate(&stop));
dim3 num_threads(TILE_WIDTH, TILE_WIDTH,1);
// One thread per output element; ceil-divide so ragged edges are covered.
dim3 num_blocks((N-1)/TILE_WIDTH + 1, (M-1)/TILE_WIDTH + 1, 1);
CUDA_CHECK(cudaEventRecord(start));
matmul_d<<<num_blocks, num_threads>>>(A, B, C, M, N, K);
CUDA_CHECK(cudaGetLastError());  // catch bad launch configuration
CUDA_CHECK(cudaEventRecord(stop));
// Blocks until the kernel (recorded before 'stop' on the same stream)
// has finished, so reading C on the host below is safe.
CUDA_CHECK(cudaEventSynchronize(stop));
float milliseconds = 0;
CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
float maxError = 0.0f;
int idx = 0;
for(int i=0; i < M*N; ++i){
// fabsf, not abs: unqualified abs may bind to the int overload and
// truncate fractional error to zero.
float err = fabsf(C[i] - gC[i]);
if (maxError < err){
maxError = err;
idx = i;
}
}
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));
CUDA_CHECK(cudaFree(A));
CUDA_CHECK(cudaFree(B));
CUDA_CHECK(cudaFree(C));
CUDA_CHECK(cudaFree(gC));
printf("Max error: %f idx: %d\n", maxError, idx);
printf("Effective time: %.3f ms\n", milliseconds);
float flops = 2 * (float)M* (float)K* (float)N;
printf("Effective Flops: %.3f GFlops\n", flops/milliseconds/1e+6);
}
@sandeepkumar-skb
Copy link
Author

sandeepkumar-skb commented Dec 14, 2020

nvcc tiled_matmul_test.cu -o tiled_matmul && ./tiled_matmul

Effective time: 35.756 ms
Effective Flops: 479.760 GFlops

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment