David Berard (davidberard98)

  • PyTorch
  • Menlo Park, CA
triton_output_with_fp16_inputs=tensor([[-17.7500, 12.5938, -28.2500, ..., -23.9531, 2.9141, -9.3359],
[ 5.3750, -24.9844, 7.1016, ..., 2.7383, -42.6562, 1.9766],
[-22.8906, -5.9766, -8.2031, ..., -0.2485, -41.5312, 19.0938],
...,
[ 5.4648, -2.0977, 18.4531, ..., -36.9688, -7.6680, -20.1719],
[ -9.2031, -12.2812, -20.5312, ..., -24.5625, -50.9062, -3.6387],
[ 44.2188, -7.1328, -28.3750, ..., 4.6914, 7.9648, -8.6641]],
device='cuda:0', dtype=torch.float16)
torch_output_with_fp16_inputs=tensor([[-17.7500, 12.5938, -28.2500, ..., -23.9531, 2.9141, -9.3359],
[ 5.3750, -24.9844, 7.1016, ..., 2.7383, -42.6562, 1.9766],
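The two tensors above are the Triton and eager PyTorch results for the same fp16 matmul (the torch output is cut off in the preview). A typical agreement check, sketched with assumed variable names for the two printed tensors:

# Hypothetical names: triton_output / torch_output stand for the tensors above.
import torch

def outputs_match(triton_output, torch_output):
    # fp16 matmuls accumulate rounding error, so compare with a loose absolute
    # tolerance rather than exact equality.
    return torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0)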
import torch
import triton

def fn(x, y, buckets):
    # bucketize each element of x, then take the outer product of the
    # resulting bucket indices with y
    buckets = torch.bucketize(x, buckets)
    return buckets[:, None] * y[None, :]

lengths = torch.randint(512, (1024,), device="cuda", dtype=torch.int32)
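A sketch of how the snippet above might be exercised; the input shapes, bucket boundaries, and the torch.compile call are illustrative assumptions, not taken from the gist:

# Illustrative inputs; shapes and boundaries are assumed.
x = torch.rand(1024, device="cuda")
y = torch.rand(64, device="cuda")
boundaries = torch.linspace(0, 1, 16, device="cuda")

compiled_fn = torch.compile(fn)
out = compiled_fn(x, y, boundaries)  # (1024, 64): bucket index of each x scaled by y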
cat > cache_bc_issue.py << EOF
import torch

torch._inductor.config.autotune_remote_cache = False

def fn(x, y, z):
    return torch.mm((x + y).relu(), z)

x, y, z = [torch.rand((8, 8), device="cuda") for _ in range(3)]
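The heredoc is cut off in the preview; a minimal sketch of how such a cache-compatibility repro typically ends, compiling the function and running it once so the FX graph and inductor cache entries get written (this continuation is an assumption, not taken from the gist):

# Assumed continuation: compile and invoke once to populate the local caches.
compiled_fn = torch.compile(fn)
print(compiled_fn(x, y, z))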
========= COMPUTE-SANITIZER
/home/dberard/local/triton-env2/pytorch/torch/backends/cudnn/__init__.py:108: UserWarning: PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild PyTorch making sure the library is visible to the build system.
warnings.warn(
test_triton_kernel_tma_descriptor_1d_dynamic_False_cuda (__main__.AOTInductorTestABICompatibleGpu) ... /home/dberard/local/triton-env2/pytorch/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /home/dberard/local/triton-env2/pytorch/aten/src/ATen/Context.cpp:148.)
torch._C._set_onednn_allow_tf32(_allow_tf32)
W0310 18:32:06.819000 1957237 torch/_export/__init__.py:67] +============================+
W0310 18:32:06.820000 1957237 torch/_export/__init__.py:68] | !!! WARNING !!! |
W0310 18:32:06.820000 1957237 torch/_export/__init__.py:69] +============================+
W0310 18:3
/tmp/tmp2y2feojh
├── 3o
│   └── c3odnhm3pars7ebyxkus2ybx6rii73pwrehcfasyw3n52zcfdhxn.py
├── aotautograd
│   └── a55zca5t57g6gulzwt7ta6wl3gsye3uzkhep3fp32tqb6q2svp75
│       └── entry
├── fxgraph
│   └── 7t
│       └── f7troxzer6vsa6la5gf46enkag5awxo4xbg3sk6ir6fhy73idhe2
│           └── jlzhkiokhghlf7w2z7szw46i4fbwr5uw4wevn3d6gion3wkzqo3
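The tree above is a torch inductor cache directory: generated Triton/Python artifacts under a two-character prefix, plus the AOTAutograd and FX graph caches. A sketch of pinning the cache to a known location for inspection, assuming the standard TORCHINDUCTOR_CACHE_DIR environment variable:

import os

# Assumption: TORCHINDUCTOR_CACHE_DIR redirects inductor's on-disk cache;
# set it before compiling so artifacts land in a directory you can inspect.
os.environ["TORCHINDUCTOR_CACHE_DIR"] = "/tmp/my_inductor_cache"

import torch  # noqa: E402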
import torch
import triton
import triton.language as tl

@triton.jit
def kernel(in_ptr, out_ptr, BLOCK_SIZE: tl.constexpr, STRING_CONSTEXPR: tl.constexpr):
    offsets = tl.arange(0, BLOCK_SIZE)
    data = tl.load(in_ptr + offsets)
    # STRING_CONSTEXPR is a compile-time string, so this comparison is resolved
    # during specialization and only the matching branch is compiled
    if STRING_CONSTEXPR == "sin":
        data = data.sin()
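A sketch of how such a kernel might be launched; the tensor size, grid, and acceptance of a string-valued constexpr at the launch site are assumptions (support for string constexprs depends on the Triton version), and the truncated kernel above would still need a tl.store to write data back out:

# Illustrative launch; sizes and grid are assumed, not from the gist.
x = torch.randn(128, device="cuda")
out = torch.empty_like(x)
kernel[(1,)](x, out, BLOCK_SIZE=128, STRING_CONSTEXPR="sin")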
#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
#blocked2 = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
#loc = loc("/home/dberard/local/fbsource/fbcode/scripts/dberard/matmul_sparse/matmul_sparse.py":157:0)
#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 256, 16]}>
#shared = #ttg.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}>
#shared1 = #ttg.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}>
#shared2 = #ttg.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
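The listing above is the header of a Triton GPU IR (TTGIR) dump: layout encodings (#blocked, #mma, #shared) followed by the module attributes; the module body is truncated in the preview. One way to obtain such a dump, sketched under the assumption of a recent Triton where JITFunction.warmup returns a compiled handle whose .asm dict holds the IR stages (the kernel name, grid, and sizes below are illustrative):

# Assumption: `kernel` is a @triton.jit function like the one above; warmup()
# compiles a specialization without launching it, and .asm maps stage names
# ("ttir", "ttgir", "llir", "ptx") to their text. Details vary by Triton version.
import torch

x = torch.randn(128, device="cuda")
out = torch.empty_like(x)
compiled = kernel.warmup(x, out, BLOCK_SIZE=128, STRING_CONSTEXPR="sin", grid=(1,))
print(compiled.asm["ttgir"])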
# Original kernel from Chao Xu (Dustinpro) and Yuanwei Fang (fywkevin)
#
# Triton official: https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
# scatter2scatter: https://github.com/shawntan/scattermoe/blob/main/scattermoe/kernels/ops.py#L58
# OpenAI sparse-autoencoder: https://github.com/openai/sparse_autoencoder/blob/main/sparse_autoencoder/kernels.py#L220
# Apple sparse-CCE: https://github.com/apple/ml-cross-entropy/blob/e43af99cb21ea27e4afe0c90c04e66f9abfd47c6/cut_cross_entropy/cce_lse_forward.py#L26
# Sparse Toolkit: https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/triton_kernels.py#L34
import torch
# @manual=//triton:triton
from triton.testing import do_bench
import torch
class MyModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # a single 16384 x 16384 linear layer in bfloat16
        self.lin = torch.nn.Linear(2**14, 2**14).to(torch.bfloat16)

    def forward(self, x):
        return self.lin(x)
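A sketch of benchmarking this module with the do_bench import above; the batch size, device placement, and use of torch.compile are assumptions for illustration:

# Illustrative benchmark; batch size and torch.compile usage are assumed.
m = MyModule().cuda()
x = torch.randn(1024, 2**14, dtype=torch.bfloat16, device="cuda")
compiled_m = torch.compile(m)
ms = do_bench(lambda: compiled_m(x))
print(f"{ms:.3f} ms per forward")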