This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
baseline runtime (s) 0.5243263244628906 | |
with reshape runtime (s) 0.0022399425506591797 | |
@ cpu | |
========= | |
baseline runtime (s) 0.25386476516723633 | |
with reshape runtime (s) 0.0008966922760009766 | |
@ cuda:0 | |
""" | |
import torch |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// https://marketplace.visualstudio.com/items?itemName=coolchyni.beyond-debug | |
{ | |
"inputs": [ | |
{ | |
"id": "hostname", | |
"description": "xxx", | |
"default": "localhost", | |
"type": "promptString" | |
}, | |
{ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# python single_gpu_ddp.py | |
# https://discuss.pytorch.org/t/single-machine-single-gpu-distributed-best-practices/169243 | |
import torch | |
import torch.distributed as dist | |
import torch.nn as nn | |
import torch.multiprocessing as mp | |
from torch.nn.parallel import DistributedDataParallel as DDP | |
import os | |
def setup(rank, world_size): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
IR module { | |
tt.func public @matmul_kernel_0d1d2d3d4c5d6c7d8c(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} { | |
%c16_i32 = arith.constant 16 : i32 | |
%c1024_i32 = arith.constant 1024 : i32 | |
%c0_i32 = arith.constant 0 : i32 | |
%cst = arith.constant dense<16> : tensor<16x16xi32> | |
%cst_0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32> | |
%0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> | |
%1 = tt.expand_dims %0 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32> | |
%2 = tt.splat %arg3 : (i32) -> tensor<16x1xi32> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import triton | |
import triton.language as tl | |
import torch.nn.functional as F | |
@triton.jit | |
def matmul_kernel( | |
a_ptr, b_ptr, c_ptr, | |
stride_am, stride_ak, | |
stride_bk, stride_bn, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import triton | |
import triton.language as tl | |
import torch.nn.functional as F | |
import time | |
@triton.jit | |
def add_kernel(x_ptr, y_ptr, output_ptr, N, | |
BLOCK_SIZE: tl.constexpr): | |
pid = tl.program_id(0) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
# acknowledgement: https://gist.github.com/bwasti/7e4cb9bd1aaddeb09bd360b570a486b1 | |
def cudagraph(f): | |
_graphs = {} | |
def f_(*args): | |
key = hash(tuple(tuple(a.shape) for a in args)) | |
if key in _graphs: | |
wrapped, *_ = _graphs[key] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Vector2D { | |
private: | |
vector<vector<int>>::iterator row, iBegin, iEnd; | |
vector<int>::iterator col; | |
public: | |
Vector2D(vector<vector<int>>& vec2d) { | |
iBegin = row = vec2d.begin(); | |
iEnd = vec2d.end(); | |
if(vec2d.size()) | |
col = row->begin(); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include<string> | |
#include<iostream> | |
#include<vector> | |
using namespace std; | |
class ShortPalindromes{ | |
public: | |
string solve(const string & s, int i, int j, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Definition for an interval. | |
* struct Interval { | |
* int start; | |
* int end; | |
* Interval() : start(0), end(0) {} | |
* Interval(int s, int e) : start(s), end(e) {} | |
* }; | |
*/ |
Newer Older