Skip to content

Instantly share code, notes, and snippets.

View sandeepkumar-skb's full-sized avatar
:octocat:
Get comfortable being uncomfortable

Sandeep Kumar Behera sandeepkumar-skb

:octocat:
Get comfortable being uncomfortable
View GitHub Profile
/* Floating Point 4x4 Matrix Multiplication */
/* Fragment: program entry point; loads the base addresses of the three
   matrices into registers. The rest of the listing is not visible here. */
.global _start
_start:
LDR R0, =matrix0 /* R0 = &matrix0 — presumably first operand; confirm against full listing */
LDR R1, =matrix1 /* R1 = &matrix1 — presumably second operand */
LDR R2, =matrix2 /* R2 = &matrix2 — presumably the result matrix; confirm */
// Gist fragment: CUDA error-checking helper (chunk is truncated below).
#include <iostream>
#include <chrono>
// Presumably threads per block for this gist's kernels — launch code not visible here.
#define BLOCK_SIZE 256
// Print a readable CUDA error message and abort. `file`/`line` identify the
// call site — presumably supplied by a wrapper macro passing __FILE__/__LINE__;
// confirm against the full gist.
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
if (err != cudaSuccess){
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
// NOTE(review): fragment ends here — the closing braces are not visible in this chunk.
// Gist fragment: benchmark constants + CUDA error-checking helper (truncated below).
#include <cuda.h>
#include <stdio.h>
// Presumably threads per block (32 = one warp per dimension for a tiled kernel) — confirm.
#define BLOCK_SIZE 32
// Presumably the benchmark repetition count — timing loop not visible in this chunk.
#define NUM_REPS 100
// Print a readable CUDA error message (caller-supplied file/line) and abort.
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
if (err != cudaSuccess){
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
// NOTE(review): fragment ends here — the closing braces are not visible in this chunk.
// Gist fragment: cooperative-groups kernel prologue (truncated below).
#include <cooperative_groups.h>
#include <algorithm>
#include <cuda.h>
#include<stdio.h>
// Pull cooperative_groups names (thread groups, sync primitives) into scope.
using namespace cooperative_groups;
// CUDA error-checking helper; this copy is cut off inside the error branch.
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
if (err != cudaSuccess){
// NOTE(review): body truncated in this chunk — see the sibling gpuAssert
// fragments in this gist for the full print-and-exit pattern.
// Disjoint Set Union (union-find). Fragment: the class is not closed in this
// chunk — find/union methods presumably follow; confirm against the full gist.
class DSU{
std::vector<int> parent; // parent[i]: parent link of element i (i itself when i is a root)
std::vector<int> rank;   // rank[i]: starts at 0; presumably used for union by rank — methods not visible
public:
// Build N singleton sets: every element starts as its own parent (its own root).
DSU(int N) : parent(N,0), rank(N,0)
{
for (int i=0; i<N; ++i){
parent[i] = i;
}
}
import torch

# GPU matmul benchmark setup: square operands of side n.
n = 1024
num_iter = 100

# Random operands resident on the CUDA device.
x = torch.randn((n, n), device='cuda')
y = torch.randn((n, n), device='cuda')

# Warmup: repeat the matmul so one-time CUDA initialization costs do not
# skew any timing done later (outside this fragment).
for _step in range(num_iter):
    z = torch.matmul(x, y)
#include <stdio.h>
#include <chrono>
#include <iostream>
// Presumably threads per block for this gist's kernels — launch code not visible here.
#define BLOCK_SIZE 128
// Print a readable CUDA error message and abort. `file`/`line` identify the
// *call site* — presumably supplied by a wrapper macro passing __FILE__/__LINE__;
// confirm against the full gist.
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess){
        // Fix: report the caller-supplied file/line parameters. The original
        // printed __FILE__/__LINE__ here, which always names this helper's own
        // location and defeats the purpose of the parameters. The sibling
        // gpuAssert copies in this gist use file/line correctly; the tail
        // (exit + closing braces) is restored to match them.
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
// Gist fragment: launch constants + error helper; cut off at the opening brace.
#include <stdio.h>
#include <iostream>
#include <chrono>
#define BLOCK_SIZE 256
#define GRID_SIZE 72 // Turing Titan RTX — 72 is its SM count; presumably one block per SM, confirm
#define OUT_SIZE 256 // presumably the output/bin count — usage not visible in this chunk
// CUDA error-checking helper (same pattern as the other copies in this gist).
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
// Gist fragment: histogram kernel prologue; the kernel body is not visible here.
#include <stdio.h>
#include <iostream>
#include <chrono>
#define BLOCK_SIZE 16
#define GRID_SIZE 72 // Turing Titan RTX — 72 is its SM count; presumably one block per SM, confirm
#define OUT_SIZE 256 // presumably the number of histogram bins — confirm against caller
// Histogram kernel over a height x width float image. Presumably bins img
// values into out[0..out_size) — body not visible in this chunk, confirm.
__global__
void histo_d(float* img, int height, int width, int *out, int out_size){
// Gist fragment: shared-memory tiled matmul kernel prologue (truncated below).
#include <iostream>
#include <stdio.h>
// Side length of the square shared-memory tiles used by matmul_d.
#define TILE_WIDTH 32
// Tiled matrix multiply C = A * B. M/N/K are the dimension extents — their
// exact mapping (rows of A vs cols of B) is not visible in this chunk; confirm.
__global__
void matmul_d(float* A, float* B, float* C, int M, int N, int K){
__shared__ float shmem_A[TILE_WIDTH][TILE_WIDTH] ; // per-block staging tile for A
__shared__ float shmem_B[TILE_WIDTH][TILE_WIDTH] ; // per-block staging tile for B
int row = blockIdx.y*blockDim.y + threadIdx.y; // global output row for this thread
// NOTE(review): fragment ends here — the rest of the kernel is not visible in this chunk.