Created
August 27, 2025 08:35
-
-
Save ita9naiwa/644f9a025d08361b4d1c6412a67e9346 to your computer and use it in GitHub Desktop.
test_scaled.dot.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import torch | |
import triton | |
import triton.language as tl | |
@triton.jit
def scaled_dot_kernel(
    # Pointers to matrices
    a_ptr, b_ptr, output_ptr,
    # Scale pointers (one scale value per 32-element group along K — see
    # offs_scale_k below)
    a_scale, b_scale,
    # Matrix dimensions
    M, N, K,
    # Strides
    stride_scale: tl.constexpr,
    stride_am, stride_ak,
    stride_bk, stride_bn,
    stride_cm, stride_cn,
    # Meta-parameters
    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
):
    """Block-scaled matmul C = scaled_dot(A, B) using tl.dot_scaled.

    Each program computes one BLOCK_M x BLOCK_N tile of the output.
    A and B are expected in an 8-bit "e5m2" float format with per-group
    uint8 scales (one scale per 32 K-elements, as implied by the
    ``BLOCK_K // 32`` arithmetic below).

    This kernel is based on the actual mxfp_matmul from test_matmul.py.
    Uses tt.dot_scaled which should trigger ttg.local_load with f8E5M2 types.
    """
    # Map the 1-D launch grid onto a 2-D (pid_m, pid_n) tile index,
    # row-major over the M dimension.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_M)
    pid_m = pid % num_pid_m
    pid_n = pid // num_pid_m
    # Row/column offsets for this tile; the `% M` / `% N` wrap keeps the
    # pointer arithmetic in-bounds when M or N is not a multiple of the
    # block size (the K-loop loads below are still unmasked, though —
    # NOTE(review): assumes K is a multiple of BLOCK_K; true for the
    # 64x64x64 demo in main(), confirm before reusing with other shapes).
    offs_am = (pid_m * BLOCK_M + tl.arange(0, BLOCK_M)) % M
    offs_bn = (pid_n * BLOCK_N + tl.arange(0, BLOCK_N)) % N
    offs_k = tl.arange(0, BLOCK_K)
    # One scale per 32 K-elements -> BLOCK_K // 32 scale columns per tile.
    offs_scale_k = tl.arange(0, BLOCK_K // 32)
    # Scale pointers - exact pattern from mxfp_matmul
    a_scale_ptr = a_scale + offs_am[:, None] * stride_scale + offs_scale_k[None, :]
    b_scale_ptr = b_scale + offs_bn[:, None] * stride_scale + offs_scale_k[None, :]
    # Matrix pointers
    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
    # Accumulate in the output element type (float32 in the demo driver).
    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=output_ptr.dtype.element_ty)
    # Main loop over K tiles - this should trigger ttg.local_load operations
    for k in range(0, tl.cdiv(K, BLOCK_K)):
        a = tl.load(a_ptrs)
        b = tl.load(b_ptrs)
        scale_a = tl.load(a_scale_ptr)
        scale_b = tl.load(b_scale_ptr)
        # This is the key operation that should generate ttg.local_load
        # with f8E5M2: both operands are declared "e5m2" 8-bit floats.
        accumulator = tl.dot_scaled(a, scale_a, "e5m2", b, scale_b, "e5m2", accumulator)
        # Advance data pointers by one K-tile and scale pointers by the
        # matching number of 32-element scale groups.
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk
        a_scale_ptr += BLOCK_K // 32
        b_scale_ptr += BLOCK_K // 32
    # Write back the tile, masking the ragged edges of C.
    offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    output_ptrs = output_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
    tl.store(output_ptrs, accumulator, mask=c_mask)
def main():
    """Build f8E5M2 inputs with per-group uint8 scales and launch
    scaled_dot_kernel once, reporting success or the full traceback.

    Shapes and block sizes mirror test_fp8_matmul.py (64x64x64 with
    32x32x32 tiles), so the grid is a flat M-tiles * N-tiles launch.
    """
    m_dim, n_dim, k_dim = 64, 64, 64
    dev = torch.cuda.current_device()

    # f8E5M2 operands (created as fp16, then down-cast) and an fp32 output.
    lhs = torch.randn((m_dim, k_dim), dtype=torch.float16, device=dev).to(torch.float8_e5m2)
    rhs = torch.randn((k_dim, n_dim), dtype=torch.float16, device=dev).to(torch.float8_e5m2)
    out = torch.empty((m_dim, n_dim), dtype=torch.float32, device=dev)

    # uint8 scale tensors, one scale per 32-element group along K
    # (mxfp-style layout).
    k_groups = (k_dim + 31) // 32
    lhs_scale = torch.randint(64, 130, (m_dim, k_groups), dtype=torch.uint8, device=dev)
    rhs_scale = torch.randint(64, 130, (n_dim, k_groups), dtype=torch.uint8, device=dev)

    print(f"Input shapes: A={lhs.shape}, B={rhs.shape}, C={out.shape}")
    print(f"Scale shapes: A_scale={lhs_scale.shape}, B_scale={rhs_scale.shape}")
    print(f"Input dtypes: A={lhs.dtype}, B={rhs.dtype}")

    blk_m = blk_n = blk_k = 32
    launch_grid = lambda meta: (triton.cdiv(m_dim, blk_m) * triton.cdiv(n_dim, blk_n),)

    try:
        # Launch kernel - this should exercise the ttg.local_load path
        # that was failing.
        scaled_dot_kernel[launch_grid](
            lhs, rhs, out,
            lhs_scale, rhs_scale,
            m_dim, n_dim, k_dim,
            lhs_scale.stride(0),  # stride_scale
            lhs.stride(0), lhs.stride(1),
            rhs.stride(0), rhs.stride(1),
            out.stride(0), out.stride(1),
            BLOCK_M=blk_m, BLOCK_N=blk_n, BLOCK_K=blk_k,
            num_warps=4,
            num_ctas=2
        )
        print("✅ Scaled dot kernel completed successfully!")
        print(f"Result shape: {out.shape}")
        print(f"Result dtype: {out.dtype}")
        print(f"Result range: [{out.min():.3f}, {out.max():.3f}]")
    except Exception as e:
        # Broad catch is deliberate: this is a repro script and we want
        # the full traceback printed rather than a bare crash.
        print(f"❌ Scaled dot kernel failed: {e}")
        import traceback
        traceback.print_exc()
# Run the repro when executed directly (not on import).
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment