David Berard (davidberard98)

  • PyTorch
  • Menlo Park, CA
triton_output_with_fp16_inputs=tensor([[-17.7500, 12.5938, -28.2500, ..., -23.9531, 2.9141, -9.3359],
[ 5.3750, -24.9844, 7.1016, ..., 2.7383, -42.6562, 1.9766],
[-22.8906, -5.9766, -8.2031, ..., -0.2485, -41.5312, 19.0938],
...,
[ 5.4648, -2.0977, 18.4531, ..., -36.9688, -7.6680, -20.1719],
[ -9.2031, -12.2812, -20.5312, ..., -24.5625, -50.9062, -3.6387],
[ 44.2188, -7.1328, -28.3750, ..., 4.6914, 7.9648, -8.6641]],
device='cuda:0', dtype=torch.float16)
torch_output_with_fp16_inputs=tensor([[-17.7500, 12.5938, -28.2500, ..., -23.9531, 2.9141, -9.3359],
[ 5.3750, -24.9844, 7.1016, ..., 2.7383, -42.6562, 1.9766],
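The two tensors above are the Triton and eager PyTorch results for the same fp16 matmul (the torch output is cut off in the preview). A typical agreement check, sketched with assumed variable names for the two printed tensors:

# Hypothetical names: triton_output / torch_output stand for the tensors above.
import torch

def outputs_match(triton_output, torch_output):
    # fp16 matmuls accumulate rounding error, so compare with a loose absolute
    # tolerance rather than exact equality.
    return torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0)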
import torch
import triton

def fn(x, y, buckets):
    # bucketize each element of x, then take the outer product of the
    # resulting bucket indices with y
    buckets = torch.bucketize(x, buckets)
    return buckets[:, None] * y[None, :]

lengths = torch.randint(512, (1024,), device="cuda", dtype=torch.int32)
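A sketch of how the snippet above might be exercised; the input shapes, bucket boundaries, and the torch.compile call are illustrative assumptions, not taken from the gist:

# Illustrative inputs; shapes and boundaries are assumed.
x = torch.rand(1024, device="cuda")
y = torch.rand(64, device="cuda")
boundaries = torch.linspace(0, 1, 16, device="cuda")

compiled_fn = torch.compile(fn)
out = compiled_fn(x, y, boundaries)  # (1024, 64): bucket index of each x scaled by y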
cat > cache_bc_issue.py << EOF
import torch

torch._inductor.config.autotune_remote_cache = False

def fn(x, y, z):
    return torch.mm((x + y).relu(), z)

x, y, z = [torch.rand((8, 8), device="cuda") for _ in range(3)]
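The heredoc is cut off in the preview; a minimal sketch of how such a cache-compatibility repro typically ends, compiling the function and running it once so the FX graph and inductor cache entries get written (this continuation is an assumption, not taken from the gist):

# Assumed continuation: compile and invoke once to populate the local caches.
compiled_fn = torch.compile(fn)
print(compiled_fn(x, y, z))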
========= COMPUTE-SANITIZER
/home/dberard/local/triton-env2/pytorch/torch/backends/cudnn/__init__.py:108: UserWarning: PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild PyTorch making sure the library is visible to the build system.
warnings.warn(
test_triton_kernel_tma_descriptor_1d_dynamic_False_cuda (__main__.AOTInductorTestABICompatibleGpu) ... /home/dberard/local/triton-env2/pytorch/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /home/dberard/local/triton-env2/pytorch/aten/src/ATen/Context.cpp:148.)
torch._C._set_onednn_allow_tf32(_allow_tf32)
W0310 18:32:06.819000 1957237 torch/_export/__init__.py:67] +============================+
W0310 18:32:06.820000 1957237 torch/_export/__init__.py:68] | !!! WARNING !!! |
W0310 18:32:06.820000 1957237 torch/_export/__init__.py:69] +============================+
W0310 18:3
/tmp/tmp2y2feojh
├── 3o
│   └── c3odnhm3pars7ebyxkus2ybx6rii73pwrehcfasyw3n52zcfdhxn.py
├── aotautograd
│   └── a55zca5t57g6gulzwt7ta6wl3gsye3uzkhep3fp32tqb6q2svp75
│       └── entry
├── fxgraph
│   └── 7t
│       └── f7troxzer6vsa6la5gf46enkag5awxo4xbg3sk6ir6fhy73idhe2
│           └── jlzhkiokhghlf7w2z7szw46i4fbwr5uw4wevn3d6gion3wkzqo3
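The tree above is a torch inductor cache directory: generated Triton/Python artifacts under a two-character prefix, plus the AOTAutograd and FX graph caches. A sketch of pinning the cache to a known location for inspection, assuming the standard TORCHINDUCTOR_CACHE_DIR environment variable:

import os

# Assumption: TORCHINDUCTOR_CACHE_DIR redirects inductor's on-disk cache;
# set it before compiling so artifacts land in a directory you can inspect.
os.environ["TORCHINDUCTOR_CACHE_DIR"] = "/tmp/my_inductor_cache"

import torch  # noqa: E402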
import torch
import triton
import triton.language as tl

@triton.jit
def kernel(in_ptr, out_ptr, BLOCK_SIZE: tl.constexpr, STRING_CONSTEXPR: tl.constexpr):
    offsets = tl.arange(0, BLOCK_SIZE)
    data = tl.load(in_ptr + offsets)
    # STRING_CONSTEXPR is a compile-time string, so this comparison is resolved
    # during specialization and only the matching branch is compiled
    if STRING_CONSTEXPR == "sin":
        data = data.sin()
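A sketch of how such a kernel might be launched; the tensor size, grid, and acceptance of a string-valued constexpr at the launch site are assumptions (support for string constexprs depends on the Triton version), and the truncated kernel above would still need a tl.store to write data back out:

# Illustrative launch; sizes and grid are assumed, not from the gist.
x = torch.randn(128, device="cuda")
out = torch.empty_like(x)
kernel[(1,)](x, out, BLOCK_SIZE=128, STRING_CONSTEXPR="sin")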
#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
#blocked2 = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
#loc = loc("/home/dberard/local/fbsource/fbcode/scripts/dberard/matmul_sparse/matmul_sparse.py":157:0)
#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 256, 16]}>
#shared = #ttg.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}>
#shared1 = #ttg.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}>
#shared2 = #ttg.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
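The listing above is the header of a Triton GPU IR (TTGIR) dump: layout encodings (#blocked, #mma, #shared) followed by the module attributes; the module body is truncated in the preview. One way to obtain such a dump, sketched under the assumption of a recent Triton where JITFunction.warmup returns a compiled handle whose .asm dict holds the IR stages (the kernel name, grid, and sizes below are illustrative):

# Assumption: `kernel` is a @triton.jit function like the one above; warmup()
# compiles a specialization without launching it, and .asm maps stage names
# ("ttir", "ttgir", "llir", "ptx") to their text. Details vary by Triton version.
import torch

x = torch.randn(128, device="cuda")
out = torch.empty_like(x)
compiled = kernel.warmup(x, out, BLOCK_SIZE=128, STRING_CONSTEXPR="sin", grid=(1,))
print(compiled.asm["ttgir"])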
# Original kernel from Chao Xu (Dustinpro) and Yuanwei Fang (fywkevin)
#
# Triton official: https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
# scatter2scatter: https://github.com/shawntan/scattermoe/blob/main/scattermoe/kernels/ops.py#L58
# OpenAI sparse-autoencoder: https://github.com/openai/sparse_autoencoder/blob/main/sparse_autoencoder/kernels.py#L220
# Apple sparse-CCE: https://github.com/apple/ml-cross-entropy/blob/e43af99cb21ea27e4afe0c90c04e66f9abfd47c6/cut_cross_entropy/cce_lse_forward.py#L26
# Sparse Toolkit: https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/triton_kernels.py#L34
import torch
# @manual=//triton:triton
from triton.testing import do_bench
import torch
class MyModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # a single 16384 x 16384 linear layer in bfloat16
        self.lin = torch.nn.Linear(2**14, 2**14).to(torch.bfloat16)

    def forward(self, x):
        return self.lin(x)
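A sketch of benchmarking this module with the do_bench import above; the batch size, device placement, and use of torch.compile are assumptions for illustration:

# Illustrative benchmark; batch size and torch.compile usage are assumed.
m = MyModule().cuda()
x = torch.randn(1024, 2**14, dtype=torch.bfloat16, device="cuda")
compiled_m = torch.compile(m)
ms = do_bench(lambda: compiled_m(x))
print(f"{ms:.3f} ms per forward")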