Skip to content

Instantly share code, notes, and snippets.

@sandeepkumar-skb
Last active January 25, 2021 04:32
Show Gist options
  • Save sandeepkumar-skb/918407d7f115e545d129bd362f40b69d to your computer and use it in GitHub Desktop.
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <stdio.h>
#define TILE_WIDTH 32
__global__
// Tiled matrix multiply: C (MxN) = A (MxK) * B (KxN), row-major.
// Expects blockDim == (TILE_WIDTH, TILE_WIDTH); grid sized to cover
// ceil(N/TILE_WIDTH) x ceil(M/TILE_WIDTH) blocks. Uses two TILE_WIDTH^2
// shared-memory tiles; out-of-range tile elements are zero-filled so
// M, N, K need not be multiples of TILE_WIDTH.
void matmul_d(float* A, float* B, float* C, int M, int N, int K){
__shared__ float shmem_A[TILE_WIDTH][TILE_WIDTH] ;
__shared__ float shmem_B[TILE_WIDTH][TILE_WIDTH] ;
int row = blockIdx.y*blockDim.y + threadIdx.y;
int col = blockIdx.x*blockDim.x + threadIdx.x;
int tx = threadIdx.x; int ty = threadIdx.y;
float temp = 0.0f;
// Ceil-divide so a ragged tail of K is still processed; the original
// floor division (K/TILE_WIDTH) dropped the last partial tile whenever
// K was not a multiple of TILE_WIDTH, despite the guards below.
int num_phases = (K + TILE_WIDTH - 1) / TILE_WIDTH;
for (int phase=0; phase < num_phases; ++phase){
// Stage one tile of A; pad out-of-bounds entries with zero.
if (row < M && phase*TILE_WIDTH+tx < K)
shmem_A[ty][tx] = A[row*K + phase*TILE_WIDTH + tx];
else
shmem_A[ty][tx] = 0.0f;
// Stage one tile of B, likewise zero-padded.
if (col < N && (ty + phase*TILE_WIDTH) < K)
shmem_B[ty][tx] = B[(ty + phase*TILE_WIDTH)*N + col];
else
shmem_B[ty][tx] = 0.0f;
__syncthreads();  // all loads visible before any thread reads the tiles
for (int i=0; i < TILE_WIDTH; ++i){
temp += shmem_A[ty][i]*shmem_B[i][tx];
}
__syncthreads();  // finish reads before the next phase overwrites tiles
}
if (row < M && col < N)
C[row*N + col] = temp;
}
template<typename T>
// CPU reference: C (MxN) = A (MxK) * B (KxN), row-major, accumulating
// in float. Used to validate the GPU kernel's output.
void matmul_golden(const T* A,
const T* B,
T* C,
int M,
int N,
int K){
for(int row=0; row<M; ++row){
for (int col=0; col<N; ++col){
float t_val = 0.0f;
for (int k=0; k<K; ++k){
// B's row stride is N (B is KxN); the original indexed B[k*K + col],
// which is correct only when K == N and otherwise reads wrong elements.
t_val += A[row*K + k] * B[k*N + col];
}
C[row*N + col] = static_cast<T>(t_val);
}
}
}
// MxK KxN -> MxN
// Abort with a diagnostic on any failed CUDA runtime call; kernel launch
// errors are caught via cudaGetLastError() after the launch.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)
// Runs the tiled GPU matmul, validates it against the CPU reference,
// and reports max absolute error, elapsed time, and effective GFLOPs.
int main(){
float *A, *B, *C, *gC;
int M = 4096;
int N = 2045;
int K = 1024;
CUDA_CHECK(cudaMallocManaged(&A, M*K*sizeof(float)));
CUDA_CHECK(cudaMallocManaged(&B, K*N*sizeof(float)));
CUDA_CHECK(cudaMallocManaged(&C, M*N*sizeof(float)));
CUDA_CHECK(cudaMallocManaged(&gC, M*N*sizeof(float)));
for (int i=0; i < M*K; ++i)
A[i] = i;
for (int i=0; i < K*N; ++i)
B[i] = 2;
for (int i=0; i < M*N; ++i)
{
C[i] = 0;
gC[i] = 0;
}
matmul_golden(A, B, gC, M, N, K);
cudaEvent_t start, stop;
CUDA_CHECK(cudaEventCreate(&start));
CUDA_CHECK(cudaEventCreate(&stop));
dim3 num_threads(TILE_WIDTH, TILE_WIDTH,1);
// One thread per output element; ceil-divide so ragged edges are covered.
dim3 num_blocks((N-1)/TILE_WIDTH + 1, (M-1)/TILE_WIDTH + 1, 1);
CUDA_CHECK(cudaEventRecord(start));
matmul_d<<<num_blocks, num_threads>>>(A, B, C, M, N, K);
CUDA_CHECK(cudaGetLastError());  // catch bad launch configuration
CUDA_CHECK(cudaEventRecord(stop));
// Blocks until the kernel (recorded before 'stop' on the same stream)
// has finished, so reading C on the host below is safe.
CUDA_CHECK(cudaEventSynchronize(stop));
float milliseconds = 0;
CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
float maxError = 0.0f;
int idx = 0;
for(int i=0; i < M*N; ++i){
// fabsf, not abs: unqualified abs may bind to the int overload and
// truncate fractional error to zero.
float err = fabsf(C[i] - gC[i]);
if (maxError < err){
maxError = err;
idx = i;
}
}
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));
CUDA_CHECK(cudaFree(A));
CUDA_CHECK(cudaFree(B));
CUDA_CHECK(cudaFree(C));
CUDA_CHECK(cudaFree(gC));
printf("Max error: %f idx: %d\n", maxError, idx);
printf("Effective time: %.3f ms\n", milliseconds);
float flops = 2 * (float)M* (float)K* (float)N;
printf("Effective Flops: %.3f GFlops\n", flops/milliseconds/1e+6);
}
@sandeepkumar-skb
Copy link
Author

sandeepkumar-skb commented Dec 14, 2020

nvcc tiled_matmul_test.cu -o tiled_matmul && ./tiled_matmul

Effective time: 35.756 ms
Effective Flops: 479.760 GFlops

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment