Maksim Levental (makslevental): gists
W20250414 14:53:24.356304 132475353552832 metadata.cpp:186] rocprofiler_iterate_agent_supported_counters returned ROCPROFILER_STATUS_ERROR_AGENT_ARCH_NOT_SUPPORTED for agent 1 (gfx1150) :: Agent HW architecture is not supported, no counter metrics found.
W20250414 14:53:24.651769 131105783740352 metadata.cpp:186] rocprofiler_iterate_agent_supported_counters returned ROCPROFILER_STATUS_ERROR_AGENT_ARCH_NOT_SUPPORTED for agent 1 (gfx1150) :: Agent HW architecture is not supported, no counter metrics found.
F20250414 14:53:24.970933 131105783740352 agent.cpp:1069] rocprofiler was only able to map 2 rocprofiler agents to HSA agents, expected 3
@ 0x773d72612360 (unknown)
@ 0x773d72613297 (unknown)
@ 0x773d7205d18a (unknown)
@ 0x773d72098ed5 rocprofiler_set_api_table
@ 0x773d71ad5afc rocprofiler_register_library_api_table
@ 0x773d660a2229 (unknown)
@ 0x773d660a3c58 (unknown)
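
The fatal agent-mapping error above is rocprofiler giving up because the gfx1150 agent has no counter support. A speculative workaround sketch, not from the gist: hide the unsupported agent from the HSA runtime so every visible agent can be mapped (the device index and the tool invocation are both assumptions):

import os
import subprocess

# ROCR_VISIBLE_DEVICES restricts which agents the HSA runtime exposes;
# "0" (the discrete GPU) is an assumption about this machine's enumeration.
env = dict(os.environ, ROCR_VISIBLE_DEVICES="0")
subprocess.run(["rocprofv3", "--", "python3", "smol_matmul.py"], env=env)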
Dispatch_ID GPU_ID Queue_ID PID TID Grid_Size Workgroup_Size LDS_Per_Workgroup Scratch_Per_Workitem Arch_VGPR Accum_VGPR SGPR Wave_Size Kernel_Name Start_Timestamp End_Timestamp Correlation_ID
0 1 1 31938 31938 32 32 0 0 40 0 128 32 smol_matmul.kd 5382059929644 5382059931804 0
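
The dispatch table above is whitespace-delimited; a small sketch (the output filename is an assumption) for loading it and computing kernel duration from the two timestamp columns:

import pandas as pd

df = pd.read_csv("results.txt", sep=r"\s+")  # assumes the table above was saved verbatim
df["Duration_ns"] = df["End_Timestamp"] - df["Start_Timestamp"]
print(df[["Kernel_Name", "Grid_Size", "Workgroup_Size", "Duration_ns"]])  # smol_matmul.kd: 2160 ns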
# RUN: python3 %s | FileCheck %s
import triton
import triton.language as tl
from triton.backends.compiler import GPUTarget

def print_test_name_and_run(f):
    # Print the test name for FileCheck, then run the test at decoration time.
    print(f"Test: {f.__name__}")
    f()
    return f
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @matmul_kernel_persistent(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
%cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #ttg.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [2, 4], instrShape = [32, 32], isTransposed = true}>>
%true = arith.constant true
%cst_0 = arith.constant dense<0.000000e+00> : tensor<64x256xf16, #ttg.blocked<{sizePerThread = [8, 1], thre
import re
import subprocess
import sys
from collections import defaultdict

test_file = "/home/mlevental/dev_projects/triton/test/TritonGPU/amd/amd-range-analysis.mlir"
cmnd = [
    "/home/mlevental/dev_projects/llvm-project/cmake-build-debug/bin/triton-opt",
test.mlir:4:13: remark: unsigned : [1, 1] signed : [-1, -1]
%true = arith.constant true
^
test.mlir:6:14: remark: unsigned : [0, 0] signed : [0, 0]
%cst_1 = arith.constant dense<0> : tensor<256xi32, #ttg.slice<{dim = 0, parent = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 8], order = [0, 1]}>}>>
^
test.mlir:6:14: remark: non-neg
%cst_1 = arith.constant dense<0> : tensor<256xi32, #ttg.slice<{dim = 0, parent = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 8], order = [0, 1]}>}>>
^
test.mlir:8:15: remark: unsigned : [1, 1] signed : [1, 1]
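
Given the remark format above ("<file>:<line>:<col>: remark: <message>"), a minimal sketch, not in the gist, of how the truncated script might continue: collect remarks per source line with re and defaultdict. It assumes cmnd has been completed with the range-analysis flags and relies on MLIR emitting diagnostics on stderr:

REMARK_RE = re.compile(r"^(?P<file>[^:]+):(?P<line>\d+):(?P<col>\d+): remark: (?P<msg>.*)$")

result = subprocess.run(cmnd + [test_file], capture_output=True, text=True)
remarks = defaultdict(list)
for out_line in result.stderr.splitlines():
    m = REMARK_RE.match(out_line)
    if m is not None:
        remarks[int(m.group("line"))].append(m.group("msg"))
# e.g. {4: ['unsigned : [1, 1] signed : [-1, -1]'], 6: ['unsigned : [0, 0] signed : [0, 0]', 'non-neg'], ...}
for line_no in sorted(remarks):
    print(line_no, remarks[line_no])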
import triton
import triton.language as tl

@triton.jit
def matmul_kernel(
    a_ptr,
    b_ptr,
    c_ptr,
    bias_ptr,
import triton
import triton.language as tl

@triton.jit
def matmul_kernel(a_ptr, b_ptr, c_ptr, bias_ptr, M, N, K, stride_am, stride_ak, stride_bk, stride_bn, stride_cm,
                  stride_cn, stride_bias, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
                  BLOCK_SIZE_K: tl.constexpr, SPLIT_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, BIAS: tl.constexpr,
                  EVEN_K: tl.constexpr, GRID_MN: tl.constexpr, NUM_XCDS: tl.constexpr):
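
A hedged launch sketch, not from the gist, for a kernel with this signature; all problem sizes, block sizes, and the NUM_XCDS value are assumptions, and it presumes the kernel body follows the truncated signature above:

import torch

M, N, K = 4096, 4096, 4096  # assumed problem size
BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, SPLIT_K = 128, 256, 64, 1
a = torch.randn(M, K, device="cuda", dtype=torch.float16)
b = torch.randn(K, N, device="cuda", dtype=torch.float16)
c = torch.empty(M, N, device="cuda", dtype=torch.float16)
bias = torch.randn(M, device="cuda", dtype=torch.float16)
grid_mn = triton.cdiv(M, BLOCK_SIZE_M) * triton.cdiv(N, BLOCK_SIZE_N)
matmul_kernel[(grid_mn, SPLIT_K)](  # one program per C tile, times SPLIT_K slices of K
    a, b, c, bias, M, N, K,
    a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1),
    bias.stride(0),
    BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N, BLOCK_SIZE_K=BLOCK_SIZE_K,
    SPLIT_K=SPLIT_K, GROUP_SIZE_M=8, BIAS=True, EVEN_K=(K % (BLOCK_SIZE_K * SPLIT_K) == 0),
    GRID_MN=grid_mn, NUM_XCDS=8,  # NUM_XCDS=8 is an assumption about the target GPU
)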
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}>
#mma = #ttg.amd_wmma<{version = 1, isTranspose = false, warpsPerCTA = [2, 2]}>
#shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [1, 0]}>
#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx1100", "ttg.threads-per-warp" = 32 : i32} {
tt.func public @matmul_kernel(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility =
:3:hip_module.cpp :58 : 16882253059 us: [pid:47943 tid:0x76c25b558740] hipModuleLoadData ( 0x76c23a13d6c8, 0xda8e430 )
:3:devprogram.cpp :2648: 16882253218 us: [pid:47943 tid:0x76c25b558740] Using Code Object V5.
:3:hip_module.cpp :59 : 16882352764 us: [pid:47943 tid:0x76c25b558740] hipModuleLoadData: Returned hipSuccess :
:3:hip_memory.cpp :615 : 16882377455 us: [pid:47943 tid:0x76c25b558740] hipMalloc ( 0x76c241563a18, 4194304 )
:3:rocdevice.cpp :2418: 16882377746 us: [pid:47943 tid:0x76c25b558740] Device=0xd954ab0, freeMem_ = 0xb3ec00000
:3:hip_memory.cpp :617 : 16882377752 us: [pid:47943 tid:0x76c25b558740] hipMalloc: Returned hipSuccess : 0x76c07ba00000: duration: 297 us
:3:hip_memory.cpp :615 : 16882377788 us: [pid:47943 tid:0x76c25b558740] hipMalloc ( 0x76c241563ac8, 4194304 )
:3:rocdevice.cpp :2418: 16882377851 us: [pid:47943 tid:0x76c25b558740] Device=0xd954ab0, freeMem_ = 0xb3e800000
:3:hip_memory.cpp
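
Traces like these come from the HIP runtime's built-in logging; the leading :3: is the log level. A minimal sketch for reproducing them (the script name is hypothetical):

import os
import subprocess

# AMD_LOG_LEVEL=3 makes the HIP runtime print API-trace lines like the ":3:" ones above.
env = dict(os.environ, AMD_LOG_LEVEL="3")
subprocess.run(["python3", "smol_matmul.py"], env=env)  # hypothetical script name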