triton_output_with_fp16_inputs=tensor([[-17.7500,  12.5938, -28.2500,  ..., -23.9531,   2.9141,  -9.3359],
        [  5.3750, -24.9844,   7.1016,  ...,   2.7383, -42.6562,   1.9766],
        [-22.8906,  -5.9766,  -8.2031,  ...,  -0.2485, -41.5312,  19.0938],
        ...,
        [  5.4648,  -2.0977,  18.4531,  ..., -36.9688,  -7.6680, -20.1719],
        [ -9.2031, -12.2812, -20.5312,  ..., -24.5625, -50.9062,  -3.6387],
        [ 44.2188,  -7.1328, -28.3750,  ...,   4.6914,   7.9648,  -8.6641]],
       device='cuda:0', dtype=torch.float16)
torch_output_with_fp16_inputs=tensor([[-17.7500,  12.5938, -28.2500,  ..., -23.9531,   2.9141,  -9.3359],
        [  5.3750, -24.9844,   7.1016,  ...,   2.7383, -42.6562,   1.9766],
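For reference, the Triton matmul tutorial validates outputs like the ones printed above with a tolerance-based comparison. A minimal sketch, assuming fp16 inputs; the tensors below are placeholders and the clone stands in for the Triton kernel's result:

import torch

# Sketch only: `triton_output` is a stand-in here; in the real script it comes
# from the Triton matmul kernel whose output is printed above.
a = torch.randn((512, 512), device="cuda", dtype=torch.float16)
b = torch.randn((512, 512), device="cuda", dtype=torch.float16)
torch_output = torch.matmul(a, b)
triton_output = torch_output.clone()  # placeholder for the Triton result
if torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0):
    print("Triton and Torch match")
else:
    print("Triton and Torch differ")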

import torch
import triton

def fn(x, y, buckets):
    buckets = torch.bucketize(x, buckets)
    return buckets[:, None] * y[None, :]

lengths = torch.randint(512, (1024,), device="cuda", dtype=torch.int32)
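The gist is cut off here. A usage sketch under assumed inputs (the bucket boundaries and weight vector below are placeholders, not from the original) would compile fn and compare it against eager:

# Assumed continuation, not from the original gist: placeholder boundaries/weights.
buckets = torch.arange(0, 512, 32, device="cuda", dtype=torch.int32)
weights = torch.rand((128,), device="cuda")

eager_out = fn(lengths, weights, buckets)
compiled_out = torch.compile(fn)(lengths, weights, buckets)
torch.testing.assert_close(eager_out, compiled_out)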

cat > cache_bc_issue.py << EOF
import torch

torch._inductor.config.autotune_remote_cache = False

def fn(x, y, z):
    return torch.mm((x + y).relu(), z)

x, y, z = [torch.rand((8, 8), device="cuda") for _ in range(3)]

========= COMPUTE-SANITIZER
/home/dberard/local/triton-env2/pytorch/torch/backends/cudnn/__init__.py:108: UserWarning: PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild PyTorch making sure the library is visible to the build system.
  warnings.warn(
test_triton_kernel_tma_descriptor_1d_dynamic_False_cuda (__main__.AOTInductorTestABICompatibleGpu) ... /home/dberard/local/triton-env2/pytorch/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /home/dberard/local/triton-env2/pytorch/aten/src/ATen/Context.cpp:148.)
  torch._C._set_onednn_allow_tf32(_allow_tf32)
W0310 18:32:06.819000 1957237 torch/_export/__init__.py:67] +============================+
W0310 18:32:06.820000 1957237 torch/_export/__init__.py:68] |      !!! WARNING !!!       |
W0310 18:32:06.820000 1957237 torch/_export/__init__.py:69] +============================+
W0310 18:3

/tmp/tmp2y2feojh
├── 3o
│   └── c3odnhm3pars7ebyxkus2ybx6rii73pwrehcfasyw3n52zcfdhxn.py
├── aotautograd
│   └── a55zca5t57g6gulzwt7ta6wl3gsye3uzkhep3fp32tqb6q2svp75
│       └── entry
├── fxgraph
│   └── 7t
│       └── f7troxzer6vsa6la5gf46enkag5awxo4xbg3sk6ir6fhy73idhe2
│           └── jlzhkiokhghlf7w2z7szw46i4fbwr5uw4wevn3d6gion3wkzqo3

import torch
import triton
import triton.language as tl

@triton.jit
def kernel(in_ptr, out_ptr, BLOCK_SIZE: tl.constexpr, STRING_CONSTEXPR: tl.constexpr):
    offsets = tl.arange(0, BLOCK_SIZE)
    data = tl.load(in_ptr + offsets)
    if STRING_CONSTEXPR == "sin":
        data = data.sin()
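A launch sketch for the kernel above, assuming the elided tail of the gist stores data back to out_ptr; the grid, block size, and tensors below are placeholders:

# Assumed usage, not from the original gist.
x = torch.randn(1024, device="cuda")
out = torch.empty_like(x)
kernel[(1,)](x, out, BLOCK_SIZE=1024, STRING_CONSTEXPR="sin")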

#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
#blocked2 = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
#loc = loc("/home/dberard/local/fbsource/fbcode/scripts/dberard/matmul_sparse/matmul_sparse.py":157:0)
#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 256, 16]}>
#shared = #ttg.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}>
#shared1 = #ttg.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}>
#shared2 = #ttg.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {

# Original kernel from Chao Xu (Dustinpro) and Yuanwei Fang (fywkevin)
#
# Triton official: https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
# scatter2scatter: https://github.com/shawntan/scattermoe/blob/main/scattermoe/kernels/ops.py#L58
# OpenAI sparse-autoencoder: https://github.com/openai/sparse_autoencoder/blob/main/sparse_autoencoder/kernels.py#L220
# Apple sparse-CCE: https://github.com/apple/ml-cross-entropy/blob/e43af99cb21ea27e4afe0c90c04e66f9abfd47c6/cut_cross_entropy/cce_lse_forward.py#L26
# Sparse Toolkit: https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/triton_kernels.py#L34
import torch

# @manual=//triton:triton

from triton.testing import do_bench
import torch

class MyModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.lin = torch.nn.Linear(2**14, 2**14).to(torch.bfloat16)

    def forward(self, x):
        return self.lin(x)
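The benchmark body is cut off in the gist. A sketch of how do_bench might be used here; the module placement, input shape, and the torch.compile call are assumptions:

# Assumed continuation, not from the original gist.
mod = MyModule().cuda()
x = torch.randn(16, 2**14, device="cuda", dtype=torch.bfloat16)

eager_ms = do_bench(lambda: mod(x))

compiled = torch.compile(mod)
compiled(x)  # trigger compilation before timing
compiled_ms = do_bench(lambda: compiled(x))

print(f"eager: {eager_ms:.3f} ms, compiled: {compiled_ms:.3f} ms")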