Skip to content

Instantly share code, notes, and snippets.

View sandeepkumar-skb's full-sized avatar
:octocat:
Get comfortable being uncomfortable

Sandeep Kumar Behera sandeepkumar-skb

:octocat:
Get comfortable being uncomfortable
View GitHub Profile
/* Floating Point 4x4 Matrix Multiplication */
/* Fragment: program entry point; loads the base addresses of the three
   matrices into registers. The rest of the listing is not visible here. */
.global _start
_start:
LDR R0, =matrix0 /* R0 = &matrix0 — presumably first operand; confirm against full listing */
LDR R1, =matrix1 /* R1 = &matrix1 — presumably second operand */
LDR R2, =matrix2 /* R2 = &matrix2 — presumably the result matrix; confirm */
// Gist fragment: CUDA error-checking helper (chunk is truncated below).
#include <iostream>
#include <chrono>
// Presumably threads per block for this gist's kernels — launch code not visible here.
#define BLOCK_SIZE 256
// Print a readable CUDA error message and abort. `file`/`line` identify the
// call site — presumably supplied by a wrapper macro passing __FILE__/__LINE__;
// confirm against the full gist.
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
if (err != cudaSuccess){
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
// NOTE(review): fragment ends here — the closing braces are not visible in this chunk.
// Gist fragment: benchmark constants + CUDA error-checking helper (truncated below).
#include <cuda.h>
#include <stdio.h>
// Presumably threads per block (32 = one warp per dimension for a tiled kernel) — confirm.
#define BLOCK_SIZE 32
// Presumably the benchmark repetition count — timing loop not visible in this chunk.
#define NUM_REPS 100
// Print a readable CUDA error message (caller-supplied file/line) and abort.
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
if (err != cudaSuccess){
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
// NOTE(review): fragment ends here — the closing braces are not visible in this chunk.
// Gist fragment: cooperative-groups kernel prologue (truncated below).
#include <cooperative_groups.h>
#include <algorithm>
#include <cuda.h>
#include<stdio.h>
// Pull cooperative_groups names (thread groups, sync primitives) into scope.
using namespace cooperative_groups;
// CUDA error-checking helper; this copy is cut off inside the error branch.
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
if (err != cudaSuccess){
// NOTE(review): body truncated in this chunk — see the sibling gpuAssert
// fragments in this gist for the full print-and-exit pattern.
// Disjoint Set Union (union-find). Fragment: the class is not closed in this
// chunk — find/union methods presumably follow; confirm against the full gist.
class DSU{
std::vector<int> parent; // parent[i]: parent link of element i (i itself when i is a root)
std::vector<int> rank;   // rank[i]: starts at 0; presumably used for union by rank — methods not visible
public:
// Build N singleton sets: every element starts as its own parent (its own root).
DSU(int N) : parent(N,0), rank(N,0)
{
for (int i=0; i<N; ++i){
parent[i] = i;
}
}
import torch

# GPU matmul benchmark setup: square operands of side n.
n = 1024
num_iter = 100

# Random operands resident on the CUDA device.
x = torch.randn((n, n), device='cuda')
y = torch.randn((n, n), device='cuda')

# Warmup: repeat the matmul so one-time CUDA initialization costs do not
# skew any timing done later (outside this fragment).
for _step in range(num_iter):
    z = torch.matmul(x, y)
#include <stdio.h>
#include <chrono>
#include <iostream>
// Presumably threads per block for this gist's kernels — launch code not visible here.
#define BLOCK_SIZE 128
// Print a readable CUDA error message and abort. `file`/`line` identify the
// *call site* — presumably supplied by a wrapper macro passing __FILE__/__LINE__;
// confirm against the full gist.
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess){
        // Fix: report the caller-supplied file/line parameters. The original
        // printed __FILE__/__LINE__ here, which always names this helper's own
        // location and defeats the purpose of the parameters. The sibling
        // gpuAssert copies in this gist use file/line correctly; the tail
        // (exit + closing braces) is restored to match them.
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
// Gist fragment: launch constants + error helper; cut off at the opening brace.
#include <stdio.h>
#include <iostream>
#include <chrono>
#define BLOCK_SIZE 256
#define GRID_SIZE 72 // Turing Titan RTX — 72 is its SM count; presumably one block per SM, confirm
#define OUT_SIZE 256 // presumably the output/bin count — usage not visible in this chunk
// CUDA error-checking helper (same pattern as the other copies in this gist).
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
// Gist fragment: histogram kernel prologue; the kernel body is not visible here.
#include <stdio.h>
#include <iostream>
#include <chrono>
#define BLOCK_SIZE 16
#define GRID_SIZE 72 // Turing Titan RTX — 72 is its SM count; presumably one block per SM, confirm
#define OUT_SIZE 256 // presumably the number of histogram bins — confirm against caller
// Histogram kernel over a height x width float image. Presumably bins img
// values into out[0..out_size) — body not visible in this chunk, confirm.
__global__
void histo_d(float* img, int height, int width, int *out, int out_size){
// Gist fragment: shared-memory tiled matmul kernel prologue (truncated below).
#include <iostream>
#include <stdio.h>
// Side length of the square shared-memory tiles used by matmul_d.
#define TILE_WIDTH 32
// Tiled matrix multiply C = A * B. M/N/K are the dimension extents — their
// exact mapping (rows of A vs cols of B) is not visible in this chunk; confirm.
__global__
void matmul_d(float* A, float* B, float* C, int M, int N, int K){
__shared__ float shmem_A[TILE_WIDTH][TILE_WIDTH] ; // per-block staging tile for A
__shared__ float shmem_B[TILE_WIDTH][TILE_WIDTH] ; // per-block staging tile for B
int row = blockIdx.y*blockDim.y + threadIdx.y; // global output row for this thread
// NOTE(review): fragment ends here — the rest of the kernel is not visible in this chunk.