Last active
January 30, 2021 04:34
-
-
Save sandeepkumar-skb/341cd017139f9a47ad68ab71013d0b54 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <stdio.h>
| #define BLOCK_SIZE 16 | |
/**
 * Terminates the program when a CUDA runtime call reports failure.
 *
 * @param err   Status code returned by a CUDA runtime API call.
 * @param file  Source file of the call site (normally __FILE__).
 * @param line  Source line of the call site (normally __LINE__).
 *
 * For any status other than cudaSuccess, the CUDA error string together with
 * the file and line is printed and the process exits with EXIT_FAILURE.
 */
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess) {
        // Report on stderr so the diagnostic is not lost or interleaved when
        // stdout (the benchmark results) is redirected.
        fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
// Wraps a CUDA runtime call and forwards its status to gpuAssert together with
// the call site's file and line. The do/while(0) wrapper makes the expansion a
// single statement, so `if (c) gpuErrchk(x); else ...` parses as intended.
#define gpuErrchk(ans)                        \
    do {                                      \
        gpuAssert((ans), __FILE__, __LINE__); \
    } while (0)
/**
 * Naive matrix-multiply kernel computing C = A * B, one thread per element of C.
 *
 * All matrices are stored row-major: A is M x K, B is K x N, C is M x N.
 * The thread's x index selects the output column and its y index the output
 * row; threads that fall outside the M x N output do nothing. The dot product
 * is accumulated in float and cast to T on store.
 *
 * @param A  Input matrix, M x K.
 * @param B  Input matrix, K x N.
 * @param C  Output matrix, M x N.
 * @param M  Rows of A and C.
 * @param N  Columns of B and C.
 * @param K  Columns of A / rows of B.
 */
template<typename T>
__global__
void matmul_d(const T* __restrict__ A,
              const T* __restrict__ B,
              T* C,
              int M,
              int N,
              int K){
    const int row = blockIdx.y*blockDim.y + threadIdx.y;
    const int col = blockIdx.x*blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so edge threads may be out of range.
    if (row >= M || col >= N) {
        return;
    }

    float t_val = 0.0f;
    for (int k = 0; k < K; ++k) {
        // B has N columns, so consecutive rows of B are N elements apart (not K).
        // Offsets are computed in size_t so large matrices cannot overflow int.
        t_val += A[(size_t)row*K + k] * B[(size_t)k*N + col];
    }
    C[(size_t)row*N + col] = static_cast<T>(t_val);
}
/**
 * CPU reference matrix multiply computing C = A * B.
 *
 * All matrices are stored row-major: A is M x K, B is K x N, C is M x N.
 * Each dot product is accumulated in float and cast to T on store.
 *
 * @param A  Input matrix, M x K.
 * @param B  Input matrix, K x N.
 * @param C  Output matrix, M x N (overwritten).
 * @param M  Rows of A and C.
 * @param N  Columns of B and C.
 * @param K  Columns of A / rows of B.
 */
template<typename T>
void matmul_golden(const T* A,
                   const T* B,
                   T* C,
                   int M,
                   int N,
                   int K){
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < N; ++col) {
            float t_val = 0.0f;
            for (int k = 0; k < K; ++k) {
                // B has N columns, so its row stride is N (not K).
                // Offsets are computed in size_t so large matrices cannot overflow int.
                t_val += A[(size_t)row*K + k] * B[(size_t)k*N + col];
            }
            C[(size_t)row*N + col] = static_cast<T>(t_val);
        }
    }
}
// Benchmark driver: multiplies an M x K matrix A by a K x N matrix B on the
// GPU with matmul_d, compares the result against the CPU reference
// matmul_golden, and prints the max absolute error, kernel time and GFlops.
int main(){
    float *A, *B, *C, *gC;
    const int M = 4096;
    const int N = 2048;
    const int K = 1024;

    // Element counts are computed in size_t so larger shapes cannot overflow int.
    const size_t elemsA = (size_t)M * K;
    const size_t elemsB = (size_t)K * N;
    const size_t elemsC = (size_t)M * N;

    // Managed allocations are accessed from both the host loops and the kernel.
    gpuErrchk(cudaMallocManaged(&A, elemsA*sizeof(float)));
    gpuErrchk(cudaMallocManaged(&B, elemsB*sizeof(float)));
    gpuErrchk(cudaMallocManaged(&C, elemsC*sizeof(float)));
    gpuErrchk(cudaMallocManaged(&gC, elemsC*sizeof(float)));

    for (size_t i = 0; i < elemsA; ++i)
        A[i] = (float)i;
    for (size_t i = 0; i < elemsB; ++i)
        B[i] = 2.0f;
    for (size_t i = 0; i < elemsC; ++i)
    {
        C[i] = 0.0f;
        gC[i] = 0.0f;
    }

    // CPU reference result.
    matmul_golden<float>(A, B, gC, M, N, K);

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    // One thread per output element; grid x spans columns (N), y spans rows (M).
    dim3 num_threads(BLOCK_SIZE, BLOCK_SIZE, 1);
    dim3 num_blocks((N-1)/BLOCK_SIZE + 1, (M-1)/BLOCK_SIZE + 1, 1);

    gpuErrchk(cudaEventRecord(start));
    matmul_d<float><<<num_blocks, num_threads>>>(A, B, C, M, N, K);
    // A kernel launch returns no status; launch failures must be queried explicitly.
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaEventRecord(stop));
    // Waiting on `stop` also guarantees the kernel has finished before C is read.
    gpuErrchk(cudaEventSynchronize(stop));

    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));

    float maxError = 0.0f;
    for (size_t i = 0; i < elemsC; ++i) {
        // std::fabs guarantees the floating-point overload (plain abs may pick int).
        const float diff = std::fabs(C[i] - gC[i]);
        if (maxError < diff) {
            maxError = diff;
        }
    }

    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    gpuErrchk(cudaFree(A));
    gpuErrchk(cudaFree(B));
    gpuErrchk(cudaFree(C));
    gpuErrchk(cudaFree(gC));

    printf("Max error: %f\n", maxError);
    printf("Effective time: %.3f ms\n", milliseconds);
    // 2*M*N*K floating-point operations (one multiply and one add per term).
    float flops = 2 * (float)M * (float)K * (float)N;
    printf("Effective Flops: %.3f GFlops\n", flops/milliseconds/1e+6);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Compile:
nvcc matmul.cu -o matmul && ./matmul
Results: