Maksim Levental (makslevental): gists
W20250414 14:53:24.356304 132475353552832 metadata.cpp:186] rocprofiler_iterate_agent_supported_counters returned ROCPROFILER_STATUS_ERROR_AGENT_ARCH_NOT_SUPPORTED for agent 1 (gfx1150) :: Agent HW architecture is not supported, no counter metrics found.
W20250414 14:53:24.651769 131105783740352 metadata.cpp:186] rocprofiler_iterate_agent_supported_counters returned ROCPROFILER_STATUS_ERROR_AGENT_ARCH_NOT_SUPPORTED for agent 1 (gfx1150) :: Agent HW architecture is not supported, no counter metrics found.
F20250414 14:53:24.970933 131105783740352 agent.cpp:1069] rocprofiler was only able to map 2 rocprofiler agents to HSA agents, expected 3
@ 0x773d72612360 (unknown)
@ 0x773d72613297 (unknown)
@ 0x773d7205d18a (unknown)
@ 0x773d72098ed5 rocprofiler_set_api_table
@ 0x773d71ad5afc rocprofiler_register_library_api_table
@ 0x773d660a2229 (unknown)
@ 0x773d660a3c58 (unknown)
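
The fatal agent-mapping error above is rocprofiler giving up because the gfx1150 agent has no counter support. A speculative workaround sketch, not from the gist: hide the unsupported agent from the HSA runtime so every visible agent can be mapped (the device index and the tool invocation are both assumptions):

import os
import subprocess

# ROCR_VISIBLE_DEVICES restricts which agents the HSA runtime exposes;
# "0" (the discrete GPU) is an assumption about this machine's enumeration.
env = dict(os.environ, ROCR_VISIBLE_DEVICES="0")
subprocess.run(["rocprofv3", "--", "python3", "smol_matmul.py"], env=env)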
Dispatch_ID GPU_ID Queue_ID PID TID Grid_Size Workgroup_Size LDS_Per_Workgroup Scratch_Per_Workitem Arch_VGPR Accum_VGPR SGPR Wave_Size Kernel_Name Start_Timestamp End_Timestamp Correlation_ID
0 1 1 31938 31938 32 32 0 0 40 0 128 32 smol_matmul.kd 5382059929644 5382059931804 0
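
The dispatch table above is whitespace-delimited; a small sketch (the output filename is an assumption) for loading it and computing kernel duration from the two timestamp columns:

import pandas as pd

df = pd.read_csv("results.txt", sep=r"\s+")  # assumes the table above was saved verbatim
df["Duration_ns"] = df["End_Timestamp"] - df["Start_Timestamp"]
print(df[["Kernel_Name", "Grid_Size", "Workgroup_Size", "Duration_ns"]])  # smol_matmul.kd: 2160 ns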
# RUN: python3 %s | FileCheck %s
import triton
import triton.language as tl
from triton.backends.compiler import GPUTarget

def print_test_name_and_run(f):
    # Print the test name for FileCheck, then run the test at decoration time.
    print(f"Test: {f.__name__}")
    f()
    return f
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @matmul_kernel_persistent(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
%cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #ttg.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [2, 4], instrShape = [32, 32], isTransposed = true}>>
%true = arith.constant true
%cst_0 = arith.constant dense<0.000000e+00> : tensor<64x256xf16, #ttg.blocked<{sizePerThread = [8, 1], thre
import re
import subprocess
import sys
from collections import defaultdict

test_file = "/home/mlevental/dev_projects/triton/test/TritonGPU/amd/amd-range-analysis.mlir"
cmnd = [
    "/home/mlevental/dev_projects/llvm-project/cmake-build-debug/bin/triton-opt",
test.mlir:4:13: remark: unsigned : [1, 1] signed : [-1, -1]
%true = arith.constant true
^
test.mlir:6:14: remark: unsigned : [0, 0] signed : [0, 0]
%cst_1 = arith.constant dense<0> : tensor<256xi32, #ttg.slice<{dim = 0, parent = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 8], order = [0, 1]}>}>>
^
test.mlir:6:14: remark: non-neg
%cst_1 = arith.constant dense<0> : tensor<256xi32, #ttg.slice<{dim = 0, parent = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 8], order = [0, 1]}>}>>
^
test.mlir:8:15: remark: unsigned : [1, 1] signed : [1, 1]
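
Given the remark format above ("<file>:<line>:<col>: remark: <message>"), a minimal sketch, not in the gist, of how the truncated script might continue: collect remarks per source line with re and defaultdict. It assumes cmnd has been completed with the range-analysis flags and relies on MLIR emitting diagnostics on stderr:

REMARK_RE = re.compile(r"^(?P<file>[^:]+):(?P<line>\d+):(?P<col>\d+): remark: (?P<msg>.*)$")

result = subprocess.run(cmnd + [test_file], capture_output=True, text=True)
remarks = defaultdict(list)
for out_line in result.stderr.splitlines():
    m = REMARK_RE.match(out_line)
    if m is not None:
        remarks[int(m.group("line"))].append(m.group("msg"))
# e.g. {4: ['unsigned : [1, 1] signed : [-1, -1]'], 6: ['unsigned : [0, 0] signed : [0, 0]', 'non-neg'], ...}
for line_no in sorted(remarks):
    print(line_no, remarks[line_no])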
import triton
import triton.language as tl

@triton.jit
def matmul_kernel(
    a_ptr,
    b_ptr,
    c_ptr,
    bias_ptr,
import triton
import triton.language as tl

@triton.jit
def matmul_kernel(a_ptr, b_ptr, c_ptr, bias_ptr, M, N, K, stride_am, stride_ak, stride_bk, stride_bn, stride_cm,
                  stride_cn, stride_bias, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
                  BLOCK_SIZE_K: tl.constexpr, SPLIT_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, BIAS: tl.constexpr,
                  EVEN_K: tl.constexpr, GRID_MN: tl.constexpr, NUM_XCDS: tl.constexpr):
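
A hedged launch sketch, not from the gist, for a kernel with this signature; all problem sizes, block sizes, and the NUM_XCDS value are assumptions, and it presumes the kernel body follows the truncated signature above:

import torch

M, N, K = 4096, 4096, 4096  # assumed problem size
BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, SPLIT_K = 128, 256, 64, 1
a = torch.randn(M, K, device="cuda", dtype=torch.float16)
b = torch.randn(K, N, device="cuda", dtype=torch.float16)
c = torch.empty(M, N, device="cuda", dtype=torch.float16)
bias = torch.randn(M, device="cuda", dtype=torch.float16)
grid_mn = triton.cdiv(M, BLOCK_SIZE_M) * triton.cdiv(N, BLOCK_SIZE_N)
matmul_kernel[(grid_mn, SPLIT_K)](  # one program per C tile, times SPLIT_K slices of K
    a, b, c, bias, M, N, K,
    a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1),
    bias.stride(0),
    BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N, BLOCK_SIZE_K=BLOCK_SIZE_K,
    SPLIT_K=SPLIT_K, GROUP_SIZE_M=8, BIAS=True, EVEN_K=(K % (BLOCK_SIZE_K * SPLIT_K) == 0),
    GRID_MN=grid_mn, NUM_XCDS=8,  # NUM_XCDS=8 is an assumption about the target GPU
)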
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}>
#mma = #ttg.amd_wmma<{version = 1, isTranspose = false, warpsPerCTA = [2, 2]}>
#shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [1, 0]}>
#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx1100", "ttg.threads-per-warp" = 32 : i32} {
tt.func public @matmul_kernel(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility =
:3:hip_module.cpp :58 : 16882253059 us: [pid:47943 tid:0x76c25b558740] hipModuleLoadData ( 0x76c23a13d6c8, 0xda8e430 )
:3:devprogram.cpp :2648: 16882253218 us: [pid:47943 tid:0x76c25b558740] Using Code Object V5.
:3:hip_module.cpp :59 : 16882352764 us: [pid:47943 tid:0x76c25b558740] hipModuleLoadData: Returned hipSuccess :
:3:hip_memory.cpp :615 : 16882377455 us: [pid:47943 tid:0x76c25b558740] hipMalloc ( 0x76c241563a18, 4194304 )
:3:rocdevice.cpp :2418: 16882377746 us: [pid:47943 tid:0x76c25b558740] Device=0xd954ab0, freeMem_ = 0xb3ec00000
:3:hip_memory.cpp :617 : 16882377752 us: [pid:47943 tid:0x76c25b558740] hipMalloc: Returned hipSuccess : 0x76c07ba00000: duration: 297 us
:3:hip_memory.cpp :615 : 16882377788 us: [pid:47943 tid:0x76c25b558740] hipMalloc ( 0x76c241563ac8, 4194304 )
:3:rocdevice.cpp :2418: 16882377851 us: [pid:47943 tid:0x76c25b558740] Device=0xd954ab0, freeMem_ = 0xb3e800000
:3:hip_memory.cpp
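
Traces like these come from the HIP runtime's built-in logging; the leading :3: is the log level. A minimal sketch for reproducing them (the script name is hypothetical):

import os
import subprocess

# AMD_LOG_LEVEL=3 makes the HIP runtime print API-trace lines like the ":3:" ones above.
env = dict(os.environ, AMD_LOG_LEVEL="3")
subprocess.run(["python3", "smol_matmul.py"], env=env)  # hypothetical script name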