This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Floating Point 4x4 Matrix Multiplication */ | |
.global _start | |
_start: | |
LDR R0, =matrix0 | |
LDR R1, =matrix1 | |
LDR R2, =matrix2 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <chrono> | |
#define BLOCK_SIZE 256 | |
inline void gpuAssert(cudaError_t err, const char *file, int line) | |
{ | |
if (err != cudaSuccess){ | |
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line); | |
exit(EXIT_FAILURE); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda.h> | |
#include <stdio.h> | |
#define BLOCK_SIZE 32 | |
#define NUM_REPS 100 | |
inline void gpuAssert(cudaError_t err, const char *file, int line) | |
{ | |
if (err != cudaSuccess){ | |
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line); | |
exit(EXIT_FAILURE); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cooperative_groups.h> | |
#include <algorithm> | |
#include <cuda.h> | |
#include<stdio.h> | |
using namespace cooperative_groups; | |
inline void gpuAssert(cudaError_t err, const char *file, int line) | |
{ | |
if (err != cudaSuccess){ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class DSU{ | |
std::vector<int> parent; | |
std::vector<int> rank; | |
public: | |
DSU(int N) : parent(N,0), rank(N,0) | |
{ | |
for (int i=0; i<N; ++i){ | |
parent[i] = i; | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
n = 1024 | |
x = torch.randn(n,n, device='cuda') | |
y = torch.randn(n,n, device='cuda') | |
num_iter = 100 | |
# Warmup | |
for _ in range(num_iter): | |
z = torch.matmul(x,y) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <chrono> | |
#include <iostream> | |
#define BLOCK_SIZE 128 | |
inline void gpuAssert(cudaError_t err, const char *file, int line) | |
{ | |
if (err != cudaSuccess){ | |
printf("%s in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <iostream> | |
#include <chrono> | |
#define BLOCK_SIZE 256 | |
#define GRID_SIZE 72 //Turing Titan RTX | |
#define OUT_SIZE 256 | |
inline void gpuAssert(cudaError_t err, const char *file, int line) | |
{ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <iostream> | |
#include <chrono> | |
#define BLOCK_SIZE 16 | |
#define GRID_SIZE 72 //Turing Titan RTX | |
#define OUT_SIZE 256 | |
__global__ | |
void histo_d(float* img, int height, int width, int *out, int out_size){ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <stdio.h> | |
#define TILE_WIDTH 32 | |
__global__ | |
void matmul_d(float* A, float* B, float* C, int M, int N, int K){ | |
__shared__ float shmem_A[TILE_WIDTH][TILE_WIDTH] ; | |
__shared__ float shmem_B[TILE_WIDTH][TILE_WIDTH] ; | |
int row = blockIdx.y*blockDim.y + threadIdx.y; |