Last active
January 25, 2021 04:32
-
-
Save sandeepkumar-skb/918407d7f115e545d129bd362f40b69d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <stdio.h> | |
#define TILE_WIDTH 32 | |
__global__ | |
void matmul_d(float* A, float* B, float* C, int M, int N, int K){ | |
__shared__ float shmem_A[TILE_WIDTH][TILE_WIDTH] ; | |
__shared__ float shmem_B[TILE_WIDTH][TILE_WIDTH] ; | |
int row = blockIdx.y*blockDim.y + threadIdx.y; | |
int col = blockIdx.x*blockDim.x + threadIdx.x; | |
int tx = threadIdx.x; int ty = threadIdx.y; | |
float temp = 0.0f; | |
for (int phase=0; phase < K/TILE_WIDTH; ++phase){ | |
if (row < M && phase*TILE_WIDTH+tx < K) | |
shmem_A[ty][tx] = A[row*K + phase*TILE_WIDTH + tx]; | |
else | |
shmem_A[ty][tx] = 0.0f; | |
if (col < N && (ty + phase*TILE_WIDTH) < K) | |
shmem_B[ty][tx] = B[(ty + phase*TILE_WIDTH)*N + col]; | |
else | |
shmem_B[ty][tx] = 0.0f; | |
__syncthreads(); | |
for (int i=0; i < TILE_WIDTH; ++i){ | |
temp += shmem_A[ty][i]*shmem_B[i][tx]; | |
} | |
__syncthreads(); | |
} | |
if (row < M && col < N) | |
C[row*N + col] = temp; | |
} | |
template<typename T> | |
void matmul_golden(const T* A, | |
const T* B, | |
T* C, | |
int M, | |
int N, | |
int K){ | |
for(int row=0; row<M; ++row){ | |
for (int col=0; col<N; ++col){ | |
float t_val = 0.0f; | |
for (int k=0; k<K; ++k){ | |
t_val += A[row*K + k] * B[k*K + col]; | |
} | |
C[row*N + col] = static_cast<T>(t_val); | |
} | |
} | |
} | |
// MxK KxN -> MxN | |
int main(){ | |
float *A, *B, *C, *gC; | |
int M = 4096; | |
int N = 2045; | |
int K = 1024; | |
cudaMallocManaged(&A, M*K*sizeof(float)); | |
cudaMallocManaged(&B, K*N*sizeof(float)); | |
cudaMallocManaged(&C, M*N*sizeof(float)); | |
cudaMallocManaged(&gC, M*N*sizeof(float)); | |
for (int i=0; i < M*K; ++i) | |
A[i] = i; | |
for (int i=0; i < K*N; ++i) | |
B[i] = 2; | |
for (int i=0; i < M*N; ++i) | |
{ | |
C[i] = 0; | |
gC[i] = 0; | |
} | |
matmul_golden(A, B, gC, M, N, K); | |
cudaEvent_t start, stop; | |
cudaEventCreate(&start); | |
cudaEventCreate(&stop); | |
dim3 num_threads(TILE_WIDTH, TILE_WIDTH,1); | |
dim3 num_blocks((N-1)/TILE_WIDTH + 1, (M-1)/TILE_WIDTH + 1, 1); | |
//matmul_d<<<num_blocks, num_threads>>>(A, B, C, M, N, K); | |
cudaEventRecord(start); | |
matmul_d<<<num_blocks, num_threads>>>(A, B, C, M, N, K); | |
cudaEventRecord(stop); | |
cudaEventSynchronize(stop); | |
float milliseconds = 0; | |
cudaEventElapsedTime(&milliseconds, start, stop); | |
float maxError = 0.0f; | |
int idx = 0; | |
for(int i=0; i < M*N; ++i){ | |
if (maxError < abs(C[i] - gC[i])){ | |
maxError = abs(C[i] - gC[i]); | |
idx = i; | |
} | |
} | |
cudaFree(A); | |
cudaFree(B); | |
cudaFree(C); | |
cudaFree(gC); | |
printf("Max error: %f idx: %d\n", maxError, idx); | |
printf("Effective time: %.3f ms\n", milliseconds); | |
float flops = 2 * (float)M* (float)K* (float)N; | |
printf("Effective Flops: %.3f GFlops\n", flops/milliseconds/1e+6); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
nvcc tiled_matmul_test.cu -o tiled_matmul && ./tiled_matmul