Created
August 27, 2025 08:35
-
-
Save ita9naiwa/644f9a025d08361b4d1c6412a67e9346 to your computer and use it in GitHub Desktop.
test_scaled.dot.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import torch | |
import triton | |
import triton.language as tl | |
@triton.jit
def scaled_dot_kernel(
    # Pointers to matrices
    a_ptr, b_ptr, output_ptr,
    # Scale pointers (one scale value per 32-element group along K — see
    # offs_scale_k below)
    a_scale, b_scale,
    # Matrix dimensions
    M, N, K,
    # Strides
    stride_scale: tl.constexpr,
    stride_am, stride_ak,
    stride_bk, stride_bn,
    stride_cm, stride_cn,
    # Meta-parameters
    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
):
    """Block-scaled matmul C = scaled_dot(A, B) using tl.dot_scaled.

    Each program computes one BLOCK_M x BLOCK_N tile of the output.
    A and B are expected in an 8-bit "e5m2" float format with per-group
    uint8 scales (one scale per 32 K-elements, as implied by the
    ``BLOCK_K // 32`` arithmetic below).

    This kernel is based on the actual mxfp_matmul from test_matmul.py.
    Uses tt.dot_scaled which should trigger ttg.local_load with f8E5M2 types.
    """
    # Map the 1-D launch grid onto a 2-D (pid_m, pid_n) tile index,
    # row-major over the M dimension.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_M)
    pid_m = pid % num_pid_m
    pid_n = pid // num_pid_m
    # Row/column offsets for this tile; the `% M` / `% N` wrap keeps the
    # pointer arithmetic in-bounds when M or N is not a multiple of the
    # block size (the K-loop loads below are still unmasked, though —
    # NOTE(review): assumes K is a multiple of BLOCK_K; true for the
    # 64x64x64 demo in main(), confirm before reusing with other shapes).
    offs_am = (pid_m * BLOCK_M + tl.arange(0, BLOCK_M)) % M
    offs_bn = (pid_n * BLOCK_N + tl.arange(0, BLOCK_N)) % N
    offs_k = tl.arange(0, BLOCK_K)
    # One scale per 32 K-elements -> BLOCK_K // 32 scale columns per tile.
    offs_scale_k = tl.arange(0, BLOCK_K // 32)
    # Scale pointers - exact pattern from mxfp_matmul
    a_scale_ptr = a_scale + offs_am[:, None] * stride_scale + offs_scale_k[None, :]
    b_scale_ptr = b_scale + offs_bn[:, None] * stride_scale + offs_scale_k[None, :]
    # Matrix pointers
    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
    # Accumulate in the output element type (float32 in the demo driver).
    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=output_ptr.dtype.element_ty)
    # Main loop over K tiles - this should trigger ttg.local_load operations
    for k in range(0, tl.cdiv(K, BLOCK_K)):
        a = tl.load(a_ptrs)
        b = tl.load(b_ptrs)
        scale_a = tl.load(a_scale_ptr)
        scale_b = tl.load(b_scale_ptr)
        # This is the key operation that should generate ttg.local_load
        # with f8E5M2: both operands are declared "e5m2" 8-bit floats.
        accumulator = tl.dot_scaled(a, scale_a, "e5m2", b, scale_b, "e5m2", accumulator)
        # Advance data pointers by one K-tile and scale pointers by the
        # matching number of 32-element scale groups.
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk
        a_scale_ptr += BLOCK_K // 32
        b_scale_ptr += BLOCK_K // 32
    # Write back the tile, masking the ragged edges of C.
    offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    output_ptrs = output_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
    tl.store(output_ptrs, accumulator, mask=c_mask)
def main():
    """Build f8E5M2 inputs with per-group uint8 scales and launch
    scaled_dot_kernel once, reporting success or the full traceback.

    Shapes and block sizes mirror test_fp8_matmul.py (64x64x64 with
    32x32x32 tiles), so the grid is a flat M-tiles * N-tiles launch.
    """
    m_dim, n_dim, k_dim = 64, 64, 64
    dev = torch.cuda.current_device()

    # f8E5M2 operands (created as fp16, then down-cast) and an fp32 output.
    lhs = torch.randn((m_dim, k_dim), dtype=torch.float16, device=dev).to(torch.float8_e5m2)
    rhs = torch.randn((k_dim, n_dim), dtype=torch.float16, device=dev).to(torch.float8_e5m2)
    out = torch.empty((m_dim, n_dim), dtype=torch.float32, device=dev)

    # uint8 scale tensors, one scale per 32-element group along K
    # (mxfp-style layout).
    k_groups = (k_dim + 31) // 32
    lhs_scale = torch.randint(64, 130, (m_dim, k_groups), dtype=torch.uint8, device=dev)
    rhs_scale = torch.randint(64, 130, (n_dim, k_groups), dtype=torch.uint8, device=dev)

    print(f"Input shapes: A={lhs.shape}, B={rhs.shape}, C={out.shape}")
    print(f"Scale shapes: A_scale={lhs_scale.shape}, B_scale={rhs_scale.shape}")
    print(f"Input dtypes: A={lhs.dtype}, B={rhs.dtype}")

    blk_m = blk_n = blk_k = 32
    launch_grid = lambda meta: (triton.cdiv(m_dim, blk_m) * triton.cdiv(n_dim, blk_n),)

    try:
        # Launch kernel - this should exercise the ttg.local_load path
        # that was failing.
        scaled_dot_kernel[launch_grid](
            lhs, rhs, out,
            lhs_scale, rhs_scale,
            m_dim, n_dim, k_dim,
            lhs_scale.stride(0),  # stride_scale
            lhs.stride(0), lhs.stride(1),
            rhs.stride(0), rhs.stride(1),
            out.stride(0), out.stride(1),
            BLOCK_M=blk_m, BLOCK_N=blk_n, BLOCK_K=blk_k,
            num_warps=4,
            num_ctas=2
        )
        print("✅ Scaled dot kernel completed successfully!")
        print(f"Result shape: {out.shape}")
        print(f"Result dtype: {out.dtype}")
        print(f"Result range: [{out.min():.3f}, {out.max():.3f}]")
    except Exception as e:
        # Broad catch is deliberate: this is a repro script and we want
        # the full traceback printed rather than a bare crash.
        print(f"❌ Scaled dot kernel failed: {e}")
        import traceback
        traceback.print_exc()
# Run the repro when executed directly (not on import).
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment