matvec in triton
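
The dumps below trace a small matvec-style Triton kernel through the AMD GPU compilation pipeline (target hip:gfx942, 16 warps of 64 threads). Here is a minimal sketch of a kernel consistent with the first dump; the names, constexpr parameters, and launch grid are assumptions — only the tile sizes (4x16 output tile, 2048-wide K chunks) and strides (4096 for both inputs, 1024 for the output) are read off the IR.

import triton
import triton.language as tl

# Hypothetical reconstruction of the kernel behind the dumps below.
# Only the tile shapes and strides come from the IR; the names and
# constexpr parameterization are assumptions.
@triton.jit
def matvec(a_ptr, b_ptr, c_ptr,
           BLOCK_M: tl.constexpr,  # 4 in the dumps
           BLOCK_N: tl.constexpr,  # 16
           BLOCK_K: tl.constexpr,  # 2048
           K: tl.constexpr,        # 4096; row stride of both inputs
           N: tl.constexpr):       # 1024; row stride of the output
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    # 4x2048 tile of A and 16x2048 tile of B, both row-major with stride K.
    a_ptrs = a_ptr + offs_m[:, None] * K + offs_k[None, :]
    b_ptrs = b_ptr + offs_n[:, None] * K + offs_k[None, :]
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for _ in range(0, K // BLOCK_K):  # two iterations in the dumps
        a = tl.load(a_ptrs)  # f16
        b = tl.load(b_ptrs)  # f16
        # Broadcast-multiply-reduce instead of tl.dot: reshape to
        # 4x1x2048 and 1x16x2048, extend to f32, multiply (implicit
        # broadcast to 4x16x2048), then sum over the K axis — matching
        # the reshape/extf/broadcast/mulf/reduce sequence in the dumps.
        a3 = tl.reshape(a, (BLOCK_M, 1, BLOCK_K)).to(tl.float32)
        b3 = tl.reshape(b, (1, BLOCK_N, BLOCK_K)).to(tl.float32)
        acc += tl.sum(a3 * b3, axis=2)
        a_ptrs += BLOCK_K
        b_ptrs += BLOCK_K
    c_ptrs = c_ptr + offs_m[:, None] * N + offs_n[None, :]
    tl.store(c_ptrs, acc.to(tl.float16))

A launch grid of (M // BLOCK_M, N // BLOCK_N) would match the program_id usage; with the constants above, each program reduces a full K = 4096 row in two 2048-wide chunks.
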
// -----// IR Dump Before ConvertTritonToTritonGPU (convert-triton-to-tritongpu) ('builtin.module' operation) //----- //
#loc = loc(unknown)
module {
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} {
%c2_i32 = arith.constant 2 : i32 loc(#loc)
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32> loc(#loc)
%c1_i32 = arith.constant 1 : i32 loc(#loc)
%c0_i32 = arith.constant 0 : i32 loc(#loc)
%cst_0 = arith.constant dense<1024> : tensor<4x1xi32> loc(#loc)
%cst_1 = arith.constant dense<2048> : tensor<16x2048xi32> loc(#loc)
%cst_2 = arith.constant dense<2048> : tensor<4x2048xi32> loc(#loc)
%cst_3 = arith.constant dense<4096> : tensor<16x1xi32> loc(#loc)
%cst_4 = arith.constant dense<4096> : tensor<4x1xi32> loc(#loc)
%c16_i32 = arith.constant 16 : i32 loc(#loc)
%c4_i32 = arith.constant 4 : i32 loc(#loc)
%0 = tt.get_program_id x : i32 loc(#loc)
%1 = tt.get_program_id y : i32 loc(#loc)
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc)
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc)
%4 = tt.splat %2 : i32 -> tensor<4xi32> loc(#loc)
%5 = arith.addi %4, %3 : tensor<4xi32> loc(#loc)
%6 = arith.muli %1, %c16_i32 : i32 loc(#loc)
%7 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc)
%8 = tt.splat %6 : i32 -> tensor<16xi32> loc(#loc)
%9 = arith.addi %8, %7 : tensor<16xi32> loc(#loc)
%10 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc)
%11 = tt.expand_dims %5 {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc)
%12 = arith.muli %11, %cst_4 : tensor<4x1xi32> loc(#loc)
%13 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>> loc(#loc)
%14 = tt.addptr %13, %12 : tensor<4x1x!tt.ptr<f16>>, tensor<4x1xi32> loc(#loc)
%15 = tt.expand_dims %10 {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc)
%16 = tt.broadcast %14 : tensor<4x1x!tt.ptr<f16>> -> tensor<4x2048x!tt.ptr<f16>> loc(#loc)
%17 = tt.broadcast %15 : tensor<1x2048xi32> -> tensor<4x2048xi32> loc(#loc)
%18 = tt.addptr %16, %17 : tensor<4x2048x!tt.ptr<f16>>, tensor<4x2048xi32> loc(#loc)
%19 = tt.expand_dims %9 {axis = 1 : i32} : tensor<16xi32> -> tensor<16x1xi32> loc(#loc)
%20 = arith.muli %19, %cst_3 : tensor<16x1xi32> loc(#loc)
%21 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>> loc(#loc)
%22 = tt.addptr %21, %20 : tensor<16x1x!tt.ptr<f16>>, tensor<16x1xi32> loc(#loc)
%23 = tt.broadcast %22 : tensor<16x1x!tt.ptr<f16>> -> tensor<16x2048x!tt.ptr<f16>> loc(#loc)
%24 = tt.broadcast %15 : tensor<1x2048xi32> -> tensor<16x2048xi32> loc(#loc)
%25 = tt.addptr %23, %24 : tensor<16x2048x!tt.ptr<f16>>, tensor<16x2048xi32> loc(#loc)
%26:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %18, %arg6 = %25) -> (tensor<4x16xf32>, tensor<4x2048x!tt.ptr<f16>>, tensor<16x2048x!tt.ptr<f16>>) : i32 {
%35 = tt.load %arg5 : tensor<4x2048x!tt.ptr<f16>> loc(#loc)
%36 = tt.load %arg6 : tensor<16x2048x!tt.ptr<f16>> loc(#loc)
%37 = tt.reshape %35 {allow_reorder = false} : tensor<4x2048xf16> -> tensor<4x1x2048xf16> loc(#loc)
%38 = tt.reshape %36 {allow_reorder = false} : tensor<16x2048xf16> -> tensor<1x16x2048xf16> loc(#loc)
%39 = arith.extf %37 : tensor<4x1x2048xf16> to tensor<4x1x2048xf32> loc(#loc)
%40 = tt.broadcast %39 : tensor<4x1x2048xf32> -> tensor<4x16x2048xf32> loc(#loc)
%41 = arith.extf %38 : tensor<1x16x2048xf16> to tensor<1x16x2048xf32> loc(#loc)
%42 = tt.broadcast %41 : tensor<1x16x2048xf32> -> tensor<4x16x2048xf32> loc(#loc)
%43 = arith.mulf %40, %42 : tensor<4x16x2048xf32> loc(#loc)
%44 = "tt.reduce"(%43) <{axis = 2 : i32}> ({
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
%48 = arith.addf %arg7, %arg8 : f32 loc(#loc)
tt.reduce.return %48 : f32 loc(#loc)
}) : (tensor<4x16x2048xf32>) -> tensor<4x16xf32> loc(#loc)
%45 = arith.addf %arg4, %44 : tensor<4x16xf32> loc(#loc)
%46 = tt.addptr %arg5, %cst_2 : tensor<4x2048x!tt.ptr<f16>>, tensor<4x2048xi32> loc(#loc)
%47 = tt.addptr %arg6, %cst_1 : tensor<16x2048x!tt.ptr<f16>>, tensor<16x2048xi32> loc(#loc)
scf.yield %45, %46, %47 : tensor<4x16xf32>, tensor<4x2048x!tt.ptr<f16>>, tensor<16x2048x!tt.ptr<f16>> loc(#loc)
} loc(#loc)
%27 = arith.muli %11, %cst_0 : tensor<4x1xi32> loc(#loc)
%28 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>> loc(#loc)
%29 = tt.addptr %28, %27 : tensor<4x1x!tt.ptr<f16>>, tensor<4x1xi32> loc(#loc)
%30 = tt.expand_dims %9 {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc)
%31 = tt.broadcast %29 : tensor<4x1x!tt.ptr<f16>> -> tensor<4x16x!tt.ptr<f16>> loc(#loc)
%32 = tt.broadcast %30 : tensor<1x16xi32> -> tensor<4x16xi32> loc(#loc)
%33 = tt.addptr %31, %32 : tensor<4x16x!tt.ptr<f16>>, tensor<4x16xi32> loc(#loc)
%34 = arith.truncf %26#0 : tensor<4x16xf32> to tensor<4x16xf16> loc(#loc)
tt.store %33, %34 : tensor<4x16x!tt.ptr<f16>> loc(#loc)
tt.return loc(#loc)
} loc(#loc)
} loc(#loc)
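// note: convert-triton-to-tritongpu ran between this dump and the next: every tensor type now carries a #blocked layout, triton_gpu.convert_layout ops appear around the expand_dims/broadcast chains, and the module gains num-warps = 16 / threads-per-warp = 64 for hip:gfx942.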
// -----// IR Dump Before TritonGPUCoalesce (tritongpu-coalesce) ('builtin.module' operation) //----- //
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 16], warpsPerCTA = [16, 1], order = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [16, 1], order = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [1, 16], order = [1, 0]}>
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [16], order = [0]}>
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [16, 1], order = [0, 1]}>
#blocked5 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [1, 16], order = [0, 1]}>
#blocked6 = #triton_gpu.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 1, 16], order = [2, 1, 0]}>
#loc = loc(unknown)
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} {
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} {
%c2_i32 = arith.constant 2 : i32 loc(#loc)
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #blocked> loc(#loc)
%c1_i32 = arith.constant 1 : i32 loc(#loc)
%c0_i32 = arith.constant 0 : i32 loc(#loc)
%cst_0 = arith.constant dense<1024> : tensor<4x1xi32, #blocked1> loc(#loc)
%cst_1 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked2> loc(#loc)
%cst_2 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked2> loc(#loc)
%cst_3 = arith.constant dense<4096> : tensor<16x1xi32, #blocked1> loc(#loc)
%cst_4 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc)
%c16_i32 = arith.constant 16 : i32 loc(#loc)
%c4_i32 = arith.constant 4 : i32 loc(#loc)
%0 = tt.get_program_id x : i32 loc(#loc)
%1 = tt.get_program_id y : i32 loc(#loc)
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc)
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #blocked3> loc(#loc)
%4 = tt.splat %2 : i32 -> tensor<4xi32, #blocked3> loc(#loc)
%5 = arith.addi %4, %3 : tensor<4xi32, #blocked3> loc(#loc)
%6 = arith.muli %1, %c16_i32 : i32 loc(#loc)
%7 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #blocked3> loc(#loc)
%8 = tt.splat %6 : i32 -> tensor<16xi32, #blocked3> loc(#loc)
%9 = arith.addi %8, %7 : tensor<16xi32, #blocked3> loc(#loc)
%10 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #blocked3> loc(#loc)
%11 = triton_gpu.convert_layout %5 : tensor<4xi32, #blocked3> -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc)
%12 = tt.expand_dims %11 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> -> tensor<4x1xi32, #blocked4> loc(#loc)
%13 = triton_gpu.convert_layout %12 : tensor<4x1xi32, #blocked4> -> tensor<4x1xi32, #blocked1> loc(#loc)
%14 = arith.muli %13, %cst_4 : tensor<4x1xi32, #blocked1> loc(#loc)
%15 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc)
%16 = tt.addptr %15, %14 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc)
%17 = triton_gpu.convert_layout %10 : tensor<2048xi32, #blocked3> -> tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> loc(#loc)
%18 = tt.expand_dims %17 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x2048xi32, #blocked5> loc(#loc)
%19 = triton_gpu.convert_layout %18 : tensor<1x2048xi32, #blocked5> -> tensor<1x2048xi32, #blocked2> loc(#loc)
%20 = tt.broadcast %16 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%21 = triton_gpu.convert_layout %20 : tensor<4x2048x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
%22 = tt.broadcast %19 : tensor<1x2048xi32, #blocked2> -> tensor<4x2048xi32, #blocked2> loc(#loc)
%23 = tt.addptr %21, %22 : tensor<4x2048x!tt.ptr<f16>, #blocked2>, tensor<4x2048xi32, #blocked2> loc(#loc)
%24 = triton_gpu.convert_layout %9 : tensor<16xi32, #blocked3> -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc)
%25 = tt.expand_dims %24 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> -> tensor<16x1xi32, #blocked4> loc(#loc)
%26 = triton_gpu.convert_layout %25 : tensor<16x1xi32, #blocked4> -> tensor<16x1xi32, #blocked1> loc(#loc)
%27 = arith.muli %26, %cst_3 : tensor<16x1xi32, #blocked1> loc(#loc)
%28 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked1> loc(#loc)
%29 = tt.addptr %28, %27 : tensor<16x1x!tt.ptr<f16>, #blocked1>, tensor<16x1xi32, #blocked1> loc(#loc)
%30 = tt.broadcast %29 : tensor<16x1x!tt.ptr<f16>, #blocked1> -> tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%31 = triton_gpu.convert_layout %30 : tensor<16x2048x!tt.ptr<f16>, #blocked1> -> tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
%32 = tt.broadcast %19 : tensor<1x2048xi32, #blocked2> -> tensor<16x2048xi32, #blocked2> loc(#loc)
%33 = tt.addptr %31, %32 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc)
%34:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %23, %arg6 = %33) -> (tensor<4x16xf32, #blocked>, tensor<4x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048x!tt.ptr<f16>, #blocked2>) : i32 {
%46 = tt.load %arg5 : tensor<4x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
%47 = tt.load %arg6 : tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
%48 = tt.reshape %46 {allow_reorder = false} : tensor<4x2048xf16, #blocked2> -> tensor<4x1x2048xf16, #blocked6> loc(#loc)
%49 = tt.reshape %47 {allow_reorder = false} : tensor<16x2048xf16, #blocked2> -> tensor<1x16x2048xf16, #blocked6> loc(#loc)
%50 = arith.extf %48 : tensor<4x1x2048xf16, #blocked6> to tensor<4x1x2048xf32, #blocked6> loc(#loc)
%51 = tt.broadcast %50 : tensor<4x1x2048xf32, #blocked6> -> tensor<4x16x2048xf32, #blocked6> loc(#loc)
%52 = arith.extf %49 : tensor<1x16x2048xf16, #blocked6> to tensor<1x16x2048xf32, #blocked6> loc(#loc)
%53 = tt.broadcast %52 : tensor<1x16x2048xf32, #blocked6> -> tensor<4x16x2048xf32, #blocked6> loc(#loc)
%54 = arith.mulf %51, %53 : tensor<4x16x2048xf32, #blocked6> loc(#loc)
%55 = "tt.reduce"(%54) <{axis = 2 : i32}> ({
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
%60 = arith.addf %arg7, %arg8 : f32 loc(#loc)
tt.reduce.return %60 : f32 loc(#loc)
}) : (tensor<4x16x2048xf32, #blocked6>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked6}>> loc(#loc)
%56 = triton_gpu.convert_layout %55 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked6}>> -> tensor<4x16xf32, #blocked> loc(#loc)
%57 = arith.addf %arg4, %56 : tensor<4x16xf32, #blocked> loc(#loc)
%58 = tt.addptr %arg5, %cst_2 : tensor<4x2048x!tt.ptr<f16>, #blocked2>, tensor<4x2048xi32, #blocked2> loc(#loc)
%59 = tt.addptr %arg6, %cst_1 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc)
scf.yield %57, %58, %59 : tensor<4x16xf32, #blocked>, tensor<4x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
} loc(#loc)
%35 = arith.muli %13, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc)
%36 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc)
%37 = tt.addptr %36, %35 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc)
%38 = triton_gpu.convert_layout %9 : tensor<16xi32, #blocked3> -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> loc(#loc)
%39 = tt.expand_dims %38 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x16xi32, #blocked5> loc(#loc)
%40 = triton_gpu.convert_layout %39 : tensor<1x16xi32, #blocked5> -> tensor<1x16xi32, #blocked> loc(#loc)
%41 = tt.broadcast %37 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x16x!tt.ptr<f16>, #blocked1> loc(#loc)
%42 = triton_gpu.convert_layout %41 : tensor<4x16x!tt.ptr<f16>, #blocked1> -> tensor<4x16x!tt.ptr<f16>, #blocked> loc(#loc)
%43 = tt.broadcast %40 : tensor<1x16xi32, #blocked> -> tensor<4x16xi32, #blocked> loc(#loc)
%44 = tt.addptr %42, %43 : tensor<4x16x!tt.ptr<f16>, #blocked>, tensor<4x16xi32, #blocked> loc(#loc)
%45 = arith.truncf %34#0 : tensor<4x16xf32, #blocked> to tensor<4x16xf16, #blocked> loc(#loc)
tt.store %44, %45 : tensor<4x16x!tt.ptr<f16>, #blocked> loc(#loc)
tt.return loc(#loc)
} loc(#loc)
} loc(#loc)
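// note: tritongpu-coalesce ran between this dump and the next: both loop loads are wrapped in convert_layout to #blocked6 (sizePerThread = [1, 8]), so each lane reads 8 contiguous f16 values (16 bytes) per global load.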
// -----// IR Dump Before TritonGPURemoveLayoutConversions (tritongpu-remove-layout-conversions) ('builtin.module' operation) //----- //
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 16], warpsPerCTA = [16, 1], order = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [16, 1], order = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [1, 16], order = [1, 0]}>
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [16], order = [0]}>
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [16, 1], order = [0, 1]}>
#blocked5 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [1, 16], order = [0, 1]}>
#blocked6 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [4, 4], order = [1, 0]}>
#blocked7 = #triton_gpu.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 1, 16], order = [2, 1, 0]}>
#loc = loc(unknown)
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} {
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} {
%c2_i32 = arith.constant 2 : i32 loc(#loc)
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #blocked> loc(#loc)
%c1_i32 = arith.constant 1 : i32 loc(#loc)
%c0_i32 = arith.constant 0 : i32 loc(#loc)
%cst_0 = arith.constant dense<1024> : tensor<4x1xi32, #blocked1> loc(#loc)
%cst_1 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked2> loc(#loc)
%cst_2 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked2> loc(#loc)
%cst_3 = arith.constant dense<4096> : tensor<16x1xi32, #blocked1> loc(#loc)
%cst_4 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc)
%c16_i32 = arith.constant 16 : i32 loc(#loc)
%c4_i32 = arith.constant 4 : i32 loc(#loc)
%0 = tt.get_program_id x : i32 loc(#loc)
%1 = tt.get_program_id y : i32 loc(#loc)
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc)
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #blocked3> loc(#loc)
%4 = tt.splat %2 : i32 -> tensor<4xi32, #blocked3> loc(#loc)
%5 = arith.addi %4, %3 : tensor<4xi32, #blocked3> loc(#loc)
%6 = arith.muli %1, %c16_i32 : i32 loc(#loc)
%7 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #blocked3> loc(#loc)
%8 = tt.splat %6 : i32 -> tensor<16xi32, #blocked3> loc(#loc)
%9 = arith.addi %8, %7 : tensor<16xi32, #blocked3> loc(#loc)
%10 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #blocked3> loc(#loc)
%11 = triton_gpu.convert_layout %5 : tensor<4xi32, #blocked3> -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc)
%12 = tt.expand_dims %11 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> -> tensor<4x1xi32, #blocked4> loc(#loc)
%13 = triton_gpu.convert_layout %12 : tensor<4x1xi32, #blocked4> -> tensor<4x1xi32, #blocked1> loc(#loc)
%14 = arith.muli %13, %cst_4 : tensor<4x1xi32, #blocked1> loc(#loc)
%15 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc)
%16 = tt.addptr %15, %14 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc)
%17 = triton_gpu.convert_layout %10 : tensor<2048xi32, #blocked3> -> tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> loc(#loc)
%18 = tt.expand_dims %17 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x2048xi32, #blocked5> loc(#loc)
%19 = triton_gpu.convert_layout %18 : tensor<1x2048xi32, #blocked5> -> tensor<1x2048xi32, #blocked2> loc(#loc)
%20 = tt.broadcast %16 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%21 = triton_gpu.convert_layout %20 : tensor<4x2048x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
%22 = tt.broadcast %19 : tensor<1x2048xi32, #blocked2> -> tensor<4x2048xi32, #blocked2> loc(#loc)
%23 = tt.addptr %21, %22 : tensor<4x2048x!tt.ptr<f16>, #blocked2>, tensor<4x2048xi32, #blocked2> loc(#loc)
%24 = triton_gpu.convert_layout %9 : tensor<16xi32, #blocked3> -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc)
%25 = tt.expand_dims %24 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> -> tensor<16x1xi32, #blocked4> loc(#loc)
%26 = triton_gpu.convert_layout %25 : tensor<16x1xi32, #blocked4> -> tensor<16x1xi32, #blocked1> loc(#loc)
%27 = arith.muli %26, %cst_3 : tensor<16x1xi32, #blocked1> loc(#loc)
%28 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked1> loc(#loc)
%29 = tt.addptr %28, %27 : tensor<16x1x!tt.ptr<f16>, #blocked1>, tensor<16x1xi32, #blocked1> loc(#loc)
%30 = tt.broadcast %29 : tensor<16x1x!tt.ptr<f16>, #blocked1> -> tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%31 = triton_gpu.convert_layout %30 : tensor<16x2048x!tt.ptr<f16>, #blocked1> -> tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
%32 = tt.broadcast %19 : tensor<1x2048xi32, #blocked2> -> tensor<16x2048xi32, #blocked2> loc(#loc)
%33 = tt.addptr %31, %32 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc)
%34:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %23, %arg6 = %33) -> (tensor<4x16xf32, #blocked>, tensor<4x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048x!tt.ptr<f16>, #blocked2>) : i32 {
%48 = triton_gpu.convert_layout %arg5 : tensor<4x2048x!tt.ptr<f16>, #blocked2> -> tensor<4x2048x!tt.ptr<f16>, #blocked6> loc(#loc)
%49 = tt.load %48 : tensor<4x2048x!tt.ptr<f16>, #blocked6> loc(#loc)
%50 = triton_gpu.convert_layout %49 : tensor<4x2048xf16, #blocked6> -> tensor<4x2048xf16, #blocked2> loc(#loc)
%51 = triton_gpu.convert_layout %arg6 : tensor<16x2048x!tt.ptr<f16>, #blocked2> -> tensor<16x2048x!tt.ptr<f16>, #blocked6> loc(#loc)
%52 = tt.load %51 : tensor<16x2048x!tt.ptr<f16>, #blocked6> loc(#loc)
%53 = triton_gpu.convert_layout %52 : tensor<16x2048xf16, #blocked6> -> tensor<16x2048xf16, #blocked2> loc(#loc)
%54 = tt.reshape %50 {allow_reorder = false} : tensor<4x2048xf16, #blocked2> -> tensor<4x1x2048xf16, #blocked7> loc(#loc)
%55 = tt.reshape %53 {allow_reorder = false} : tensor<16x2048xf16, #blocked2> -> tensor<1x16x2048xf16, #blocked7> loc(#loc)
%56 = arith.extf %54 : tensor<4x1x2048xf16, #blocked7> to tensor<4x1x2048xf32, #blocked7> loc(#loc)
%57 = tt.broadcast %56 : tensor<4x1x2048xf32, #blocked7> -> tensor<4x16x2048xf32, #blocked7> loc(#loc)
%58 = arith.extf %55 : tensor<1x16x2048xf16, #blocked7> to tensor<1x16x2048xf32, #blocked7> loc(#loc)
%59 = tt.broadcast %58 : tensor<1x16x2048xf32, #blocked7> -> tensor<4x16x2048xf32, #blocked7> loc(#loc)
%60 = arith.mulf %57, %59 : tensor<4x16x2048xf32, #blocked7> loc(#loc)
%61 = "tt.reduce"(%60) <{axis = 2 : i32}> ({
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
%66 = arith.addf %arg7, %arg8 : f32 loc(#loc)
tt.reduce.return %66 : f32 loc(#loc)
}) : (tensor<4x16x2048xf32, #blocked7>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked7}>> loc(#loc)
%62 = triton_gpu.convert_layout %61 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked7}>> -> tensor<4x16xf32, #blocked> loc(#loc)
%63 = arith.addf %arg4, %62 : tensor<4x16xf32, #blocked> loc(#loc)
%64 = tt.addptr %arg5, %cst_2 : tensor<4x2048x!tt.ptr<f16>, #blocked2>, tensor<4x2048xi32, #blocked2> loc(#loc)
%65 = tt.addptr %arg6, %cst_1 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc)
scf.yield %63, %64, %65 : tensor<4x16xf32, #blocked>, tensor<4x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
} loc(#loc)
%35 = arith.muli %13, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc)
%36 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc)
%37 = tt.addptr %36, %35 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc)
%38 = triton_gpu.convert_layout %9 : tensor<16xi32, #blocked3> -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> loc(#loc)
%39 = tt.expand_dims %38 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x16xi32, #blocked5> loc(#loc)
%40 = triton_gpu.convert_layout %39 : tensor<1x16xi32, #blocked5> -> tensor<1x16xi32, #blocked> loc(#loc)
%41 = tt.broadcast %37 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x16x!tt.ptr<f16>, #blocked1> loc(#loc)
%42 = triton_gpu.convert_layout %41 : tensor<4x16x!tt.ptr<f16>, #blocked1> -> tensor<4x16x!tt.ptr<f16>, #blocked> loc(#loc)
%43 = tt.broadcast %40 : tensor<1x16xi32, #blocked> -> tensor<4x16xi32, #blocked> loc(#loc)
%44 = tt.addptr %42, %43 : tensor<4x16x!tt.ptr<f16>, #blocked>, tensor<4x16xi32, #blocked> loc(#loc)
%45 = arith.truncf %34#0 : tensor<4x16xf32, #blocked> to tensor<4x16xf16, #blocked> loc(#loc)
%46 = triton_gpu.convert_layout %44 : tensor<4x16x!tt.ptr<f16>, #blocked> -> tensor<4x16x!tt.ptr<f16>, #blocked> loc(#loc)
%47 = triton_gpu.convert_layout %45 : tensor<4x16xf16, #blocked> -> tensor<4x16xf16, #blocked> loc(#loc)
tt.store %46, %47 : tensor<4x16x!tt.ptr<f16>, #blocked> loc(#loc)
tt.return loc(#loc)
} loc(#loc)
} loc(#loc)
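// note: tritongpu-remove-layout-conversions ran between this dump and the next: the coalesced #blocked1 layout (sizePerThread = [1, 8]) is propagated back into the pointer arithmetic so the loads need no conversion, and the loop-carried accumulator now lives in a slice of a 3-D layout.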
// -----// IR Dump Before TritonGPUOptimizeThreadLocality (tritongpu-optimize-thread-locality) ('builtin.module' operation) //----- //
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 4, 4], order = [2, 1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [4, 4], order = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 16], warpsPerCTA = [16, 1], order = [1, 0]}>
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [4, 1, 4], order = [2, 1, 0]}>
#loc = loc(unknown)
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} {
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} {
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%c2_i32 = arith.constant 2 : i32 loc(#loc)
%c1_i32 = arith.constant 1 : i32 loc(#loc)
%c0_i32 = arith.constant 0 : i32 loc(#loc)
%c4_i32 = arith.constant 4 : i32 loc(#loc)
%c16_i32 = arith.constant 16 : i32 loc(#loc)
%cst_0 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc)
%cst_1 = arith.constant dense<4096> : tensor<16x1xi32, #blocked1> loc(#loc)
%cst_2 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked1> loc(#loc)
%cst_3 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked1> loc(#loc)
%cst_4 = arith.constant dense<1024> : tensor<4x1xi32, #blocked2> loc(#loc)
%0 = tt.get_program_id x : i32 loc(#loc)
%1 = tt.get_program_id y : i32 loc(#loc)
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc)
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%4 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%5 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%6 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%7 = arith.addi %5, %3 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%8 = arith.addi %6, %4 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%9 = arith.muli %1, %c16_i32 : i32 loc(#loc)
%10 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc)
%12 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%13 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc)
%14 = arith.addi %12, %10 : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%15 = arith.addi %13, %11 : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc)
%16 = tt.expand_dims %7 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc)
%17 = tt.expand_dims %8 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<4x1xi32, #blocked2> loc(#loc)
%18 = arith.muli %16, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc)
%19 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc)
%20 = tt.addptr %19, %18 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc)
%21 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> loc(#loc)
%22 = tt.expand_dims %21 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x2048xi32, #blocked1> loc(#loc)
%23 = tt.broadcast %20 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%24 = tt.broadcast %22 : tensor<1x2048xi32, #blocked1> -> tensor<4x2048xi32, #blocked1> loc(#loc)
%25 = tt.addptr %23, %24 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc)
%26 = tt.expand_dims %14 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<16x1xi32, #blocked1> loc(#loc)
%27 = arith.muli %26, %cst_1 : tensor<16x1xi32, #blocked1> loc(#loc)
%28 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked1> loc(#loc)
%29 = tt.addptr %28, %27 : tensor<16x1x!tt.ptr<f16>, #blocked1>, tensor<16x1xi32, #blocked1> loc(#loc)
%30 = tt.broadcast %29 : tensor<16x1x!tt.ptr<f16>, #blocked1> -> tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%31 = tt.broadcast %22 : tensor<1x2048xi32, #blocked1> -> tensor<16x2048xi32, #blocked1> loc(#loc)
%32 = tt.addptr %30, %31 : tensor<16x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048xi32, #blocked1> loc(#loc)
%33:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %25, %arg6 = %32) -> (tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked1>) : i32 {
%43 = tt.load %arg5 : tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%44 = tt.load %arg6 : tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%45 = tt.reshape %43 {allow_reorder = false} : tensor<4x2048xf16, #blocked1> -> tensor<4x1x2048xf16, #blocked3> loc(#loc)
%46 = tt.reshape %44 {allow_reorder = false} : tensor<16x2048xf16, #blocked1> -> tensor<1x16x2048xf16, #blocked> loc(#loc)
%47 = arith.extf %45 : tensor<4x1x2048xf16, #blocked3> to tensor<4x1x2048xf32, #blocked3> loc(#loc)
%48 = triton_gpu.convert_layout %47 : tensor<4x1x2048xf32, #blocked3> -> tensor<4x1x2048xf32, #blocked> loc(#loc)
%49 = tt.broadcast %48 : tensor<4x1x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc)
%50 = arith.extf %46 : tensor<1x16x2048xf16, #blocked> to tensor<1x16x2048xf32, #blocked> loc(#loc)
%51 = tt.broadcast %50 : tensor<1x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc)
%52 = arith.mulf %49, %51 : tensor<4x16x2048xf32, #blocked> loc(#loc)
%53 = "tt.reduce"(%52) <{axis = 2 : i32}> ({
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
%57 = arith.addf %arg7, %arg8 : f32 loc(#loc)
tt.reduce.return %57 : f32 loc(#loc)
}) : (tensor<4x16x2048xf32, #blocked>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%54 = arith.addf %arg4, %53 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%55 = tt.addptr %arg5, %cst_2 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc)
%56 = tt.addptr %arg6, %cst_3 : tensor<16x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048xi32, #blocked1> loc(#loc)
scf.yield %54, %55, %56 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
} loc(#loc)
%34 = arith.muli %17, %cst_4 : tensor<4x1xi32, #blocked2> loc(#loc)
%35 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked2> loc(#loc)
%36 = tt.addptr %35, %34 : tensor<4x1x!tt.ptr<f16>, #blocked2>, tensor<4x1xi32, #blocked2> loc(#loc)
%37 = tt.expand_dims %15 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x16xi32, #blocked2> loc(#loc)
%38 = tt.broadcast %36 : tensor<4x1x!tt.ptr<f16>, #blocked2> -> tensor<4x16x!tt.ptr<f16>, #blocked2> loc(#loc)
%39 = tt.broadcast %37 : tensor<1x16xi32, #blocked2> -> tensor<4x16xi32, #blocked2> loc(#loc)
%40 = tt.addptr %38, %39 : tensor<4x16x!tt.ptr<f16>, #blocked2>, tensor<4x16xi32, #blocked2> loc(#loc)
%41 = arith.truncf %33#0 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> to tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%42 = triton_gpu.convert_layout %41 : tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16xf16, #blocked2> loc(#loc)
tt.store %40, %42 : tensor<4x16x!tt.ptr<f16>, #blocked2> loc(#loc)
tt.return loc(#loc)
} loc(#loc)
} loc(#loc)
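// note: tritongpu-optimize-thread-locality ran between this dump and the next: the 4x16x2048 product is converted to #blocked4 (warpsPerCTA = [1, 16, 1], a single warp along the reduced axis) and the reduce now carries {preserve_layout}.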
// -----// IR Dump Before TritonAMDGPUAccelerateMatmul (tritonamdgpu-accelerate-matmul) ('builtin.module' operation) //----- //
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 4, 4], order = [2, 1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [4, 4], order = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 16], warpsPerCTA = [16, 1], order = [1, 0]}>
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [4, 1, 4], order = [2, 1, 0]}>
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 16, 1], order = [2, 1, 0]}>
#loc = loc(unknown)
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} {
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} {
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%c2_i32 = arith.constant 2 : i32 loc(#loc)
%c1_i32 = arith.constant 1 : i32 loc(#loc)
%c0_i32 = arith.constant 0 : i32 loc(#loc)
%c4_i32 = arith.constant 4 : i32 loc(#loc)
%c16_i32 = arith.constant 16 : i32 loc(#loc)
%cst_0 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc)
%cst_1 = arith.constant dense<4096> : tensor<16x1xi32, #blocked1> loc(#loc)
%cst_2 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked1> loc(#loc)
%cst_3 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked1> loc(#loc)
%cst_4 = arith.constant dense<1024> : tensor<4x1xi32, #blocked2> loc(#loc)
%0 = tt.get_program_id x : i32 loc(#loc)
%1 = tt.get_program_id y : i32 loc(#loc)
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc)
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%4 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%5 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%6 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%7 = arith.addi %5, %3 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%8 = arith.addi %6, %4 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%9 = arith.muli %1, %c16_i32 : i32 loc(#loc)
%10 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc)
%12 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%13 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc)
%14 = arith.addi %12, %10 : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%15 = arith.addi %13, %11 : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc)
%16 = tt.expand_dims %7 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc)
%17 = tt.expand_dims %8 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<4x1xi32, #blocked2> loc(#loc)
%18 = arith.muli %16, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc)
%19 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc)
%20 = tt.addptr %19, %18 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc)
%21 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> loc(#loc)
%22 = tt.expand_dims %21 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x2048xi32, #blocked1> loc(#loc)
%23 = tt.broadcast %20 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%24 = tt.broadcast %22 : tensor<1x2048xi32, #blocked1> -> tensor<4x2048xi32, #blocked1> loc(#loc)
%25 = tt.addptr %23, %24 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc)
%26 = tt.expand_dims %14 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<16x1xi32, #blocked1> loc(#loc)
%27 = arith.muli %26, %cst_1 : tensor<16x1xi32, #blocked1> loc(#loc)
%28 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked1> loc(#loc)
%29 = tt.addptr %28, %27 : tensor<16x1x!tt.ptr<f16>, #blocked1>, tensor<16x1xi32, #blocked1> loc(#loc)
%30 = tt.broadcast %29 : tensor<16x1x!tt.ptr<f16>, #blocked1> -> tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%31 = tt.broadcast %22 : tensor<1x2048xi32, #blocked1> -> tensor<16x2048xi32, #blocked1> loc(#loc)
%32 = tt.addptr %30, %31 : tensor<16x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048xi32, #blocked1> loc(#loc)
%33:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %25, %arg6 = %32) -> (tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked1>) : i32 {
%43 = tt.load %arg5 : tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%44 = tt.load %arg6 : tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%45 = tt.reshape %43 {allow_reorder = false} : tensor<4x2048xf16, #blocked1> -> tensor<4x1x2048xf16, #blocked3> loc(#loc)
%46 = tt.reshape %44 {allow_reorder = false} : tensor<16x2048xf16, #blocked1> -> tensor<1x16x2048xf16, #blocked> loc(#loc)
%47 = arith.extf %45 : tensor<4x1x2048xf16, #blocked3> to tensor<4x1x2048xf32, #blocked3> loc(#loc)
%48 = triton_gpu.convert_layout %47 : tensor<4x1x2048xf32, #blocked3> -> tensor<4x1x2048xf32, #blocked> loc(#loc)
%49 = tt.broadcast %48 : tensor<4x1x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc)
%50 = arith.extf %46 : tensor<1x16x2048xf16, #blocked> to tensor<1x16x2048xf32, #blocked> loc(#loc)
%51 = tt.broadcast %50 : tensor<1x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc)
%52 = arith.mulf %49, %51 : tensor<4x16x2048xf32, #blocked> loc(#loc)
%53 = triton_gpu.convert_layout %52 : tensor<4x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked4> loc(#loc)
%54 = "tt.reduce"(%53) <{axis = 2 : i32}> ({
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
%59 = arith.addf %arg7, %arg8 : f32 loc(#loc)
tt.reduce.return %59 : f32 loc(#loc)
}) {preserve_layout} : (tensor<4x16x2048xf32, #blocked4>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked4}>> loc(#loc)
%55 = triton_gpu.convert_layout %54 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked4}>> -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%56 = arith.addf %arg4, %55 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%57 = tt.addptr %arg5, %cst_2 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc)
%58 = tt.addptr %arg6, %cst_3 : tensor<16x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048xi32, #blocked1> loc(#loc)
scf.yield %56, %57, %58 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
} loc(#loc)
%34 = arith.muli %17, %cst_4 : tensor<4x1xi32, #blocked2> loc(#loc)
%35 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked2> loc(#loc)
%36 = tt.addptr %35, %34 : tensor<4x1x!tt.ptr<f16>, #blocked2>, tensor<4x1xi32, #blocked2> loc(#loc)
%37 = tt.expand_dims %15 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x16xi32, #blocked2> loc(#loc)
%38 = tt.broadcast %36 : tensor<4x1x!tt.ptr<f16>, #blocked2> -> tensor<4x16x!tt.ptr<f16>, #blocked2> loc(#loc)
%39 = tt.broadcast %37 : tensor<1x16xi32, #blocked2> -> tensor<4x16xi32, #blocked2> loc(#loc)
%40 = tt.addptr %38, %39 : tensor<4x16x!tt.ptr<f16>, #blocked2>, tensor<4x16xi32, #blocked2> loc(#loc)
%41 = arith.truncf %33#0 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> to tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%42 = triton_gpu.convert_layout %41 : tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16xf16, #blocked2> loc(#loc)
tt.store %40, %42 : tensor<4x16x!tt.ptr<f16>, #blocked2> loc(#loc)
tt.return loc(#loc)
} loc(#loc)
} loc(#loc)
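// note: tritonamdgpu-accelerate-matmul made no changes here; the kernel contains no tt.dot to map onto MFMA instructions, so the next dump is identical to this one.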
// -----// IR Dump Before TritonGPURemoveLayoutConversions (tritongpu-remove-layout-conversions) ('builtin.module' operation) //----- //
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 4, 4], order = [2, 1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [4, 4], order = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 16], warpsPerCTA = [16, 1], order = [1, 0]}>
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [4, 1, 4], order = [2, 1, 0]}>
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 16, 1], order = [2, 1, 0]}>
#loc = loc(unknown)
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} {
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} {
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%c2_i32 = arith.constant 2 : i32 loc(#loc)
%c1_i32 = arith.constant 1 : i32 loc(#loc)
%c0_i32 = arith.constant 0 : i32 loc(#loc)
%c4_i32 = arith.constant 4 : i32 loc(#loc)
%c16_i32 = arith.constant 16 : i32 loc(#loc)
%cst_0 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc)
%cst_1 = arith.constant dense<4096> : tensor<16x1xi32, #blocked1> loc(#loc)
%cst_2 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked1> loc(#loc)
%cst_3 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked1> loc(#loc)
%cst_4 = arith.constant dense<1024> : tensor<4x1xi32, #blocked2> loc(#loc)
%0 = tt.get_program_id x : i32 loc(#loc)
%1 = tt.get_program_id y : i32 loc(#loc)
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc)
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%4 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%5 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%6 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%7 = arith.addi %5, %3 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%8 = arith.addi %6, %4 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%9 = arith.muli %1, %c16_i32 : i32 loc(#loc)
%10 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc)
%12 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%13 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc)
%14 = arith.addi %12, %10 : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%15 = arith.addi %13, %11 : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc)
%16 = tt.expand_dims %7 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc)
%17 = tt.expand_dims %8 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<4x1xi32, #blocked2> loc(#loc)
%18 = arith.muli %16, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc)
%19 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc)
%20 = tt.addptr %19, %18 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc)
%21 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> loc(#loc)
%22 = tt.expand_dims %21 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x2048xi32, #blocked1> loc(#loc)
%23 = tt.broadcast %20 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%24 = tt.broadcast %22 : tensor<1x2048xi32, #blocked1> -> tensor<4x2048xi32, #blocked1> loc(#loc)
%25 = tt.addptr %23, %24 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc)
%26 = tt.expand_dims %14 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<16x1xi32, #blocked1> loc(#loc)
%27 = arith.muli %26, %cst_1 : tensor<16x1xi32, #blocked1> loc(#loc)
%28 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked1> loc(#loc)
%29 = tt.addptr %28, %27 : tensor<16x1x!tt.ptr<f16>, #blocked1>, tensor<16x1xi32, #blocked1> loc(#loc)
%30 = tt.broadcast %29 : tensor<16x1x!tt.ptr<f16>, #blocked1> -> tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%31 = tt.broadcast %22 : tensor<1x2048xi32, #blocked1> -> tensor<16x2048xi32, #blocked1> loc(#loc)
%32 = tt.addptr %30, %31 : tensor<16x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048xi32, #blocked1> loc(#loc)
%33:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %25, %arg6 = %32) -> (tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked1>) : i32 {
%43 = tt.load %arg5 : tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%44 = tt.load %arg6 : tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%45 = tt.reshape %43 {allow_reorder = false} : tensor<4x2048xf16, #blocked1> -> tensor<4x1x2048xf16, #blocked3> loc(#loc)
%46 = tt.reshape %44 {allow_reorder = false} : tensor<16x2048xf16, #blocked1> -> tensor<1x16x2048xf16, #blocked> loc(#loc)
%47 = arith.extf %45 : tensor<4x1x2048xf16, #blocked3> to tensor<4x1x2048xf32, #blocked3> loc(#loc)
%48 = triton_gpu.convert_layout %47 : tensor<4x1x2048xf32, #blocked3> -> tensor<4x1x2048xf32, #blocked> loc(#loc)
%49 = tt.broadcast %48 : tensor<4x1x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc)
%50 = arith.extf %46 : tensor<1x16x2048xf16, #blocked> to tensor<1x16x2048xf32, #blocked> loc(#loc)
%51 = tt.broadcast %50 : tensor<1x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc)
%52 = arith.mulf %49, %51 : tensor<4x16x2048xf32, #blocked> loc(#loc)
%53 = triton_gpu.convert_layout %52 : tensor<4x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked4> loc(#loc)
%54 = "tt.reduce"(%53) <{axis = 2 : i32}> ({
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
%59 = arith.addf %arg7, %arg8 : f32 loc(#loc)
tt.reduce.return %59 : f32 loc(#loc)
}) {preserve_layout} : (tensor<4x16x2048xf32, #blocked4>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked4}>> loc(#loc)
%55 = triton_gpu.convert_layout %54 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked4}>> -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%56 = arith.addf %arg4, %55 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%57 = tt.addptr %arg5, %cst_2 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc)
%58 = tt.addptr %arg6, %cst_3 : tensor<16x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048xi32, #blocked1> loc(#loc)
scf.yield %56, %57, %58 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
} loc(#loc)
%34 = arith.muli %17, %cst_4 : tensor<4x1xi32, #blocked2> loc(#loc)
%35 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked2> loc(#loc)
%36 = tt.addptr %35, %34 : tensor<4x1x!tt.ptr<f16>, #blocked2>, tensor<4x1xi32, #blocked2> loc(#loc)
%37 = tt.expand_dims %15 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x16xi32, #blocked2> loc(#loc)
%38 = tt.broadcast %36 : tensor<4x1x!tt.ptr<f16>, #blocked2> -> tensor<4x16x!tt.ptr<f16>, #blocked2> loc(#loc)
%39 = tt.broadcast %37 : tensor<1x16xi32, #blocked2> -> tensor<4x16xi32, #blocked2> loc(#loc)
%40 = tt.addptr %38, %39 : tensor<4x16x!tt.ptr<f16>, #blocked2>, tensor<4x16xi32, #blocked2> loc(#loc)
%41 = arith.truncf %33#0 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> to tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%42 = triton_gpu.convert_layout %41 : tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16xf16, #blocked2> loc(#loc)
tt.store %40, %42 : tensor<4x16x!tt.ptr<f16>, #blocked2> loc(#loc)
tt.return loc(#loc)
} loc(#loc)
} loc(#loc)
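// note: the second tritongpu-remove-layout-conversions run propagates the reduce-friendly layouts outward: the 16x2048 load now uses #blocked2 (warpsPerCTA = [16, 1]), the epilogue store happens directly in the slice layout, and a single convert_layout remains inside the loop.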
// -----// IR Dump Before TritonAMDGPUOptimizeEpilogue (tritonamdgpu-optimize-epilogue) ('builtin.module' operation) //----- //
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 16, 1], order = [2, 1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [4, 4], order = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [16, 1], order = [1, 0]}>
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [4, 1, 4], order = [2, 1, 0]}>
#loc = loc(unknown)
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} {
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} {
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%c2_i32 = arith.constant 2 : i32 loc(#loc)
%c1_i32 = arith.constant 1 : i32 loc(#loc)
%c0_i32 = arith.constant 0 : i32 loc(#loc)
%c4_i32 = arith.constant 4 : i32 loc(#loc)
%c16_i32 = arith.constant 16 : i32 loc(#loc)
%cst_0 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc)
%cst_1 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked1> loc(#loc)
%cst_2 = arith.constant dense<1024> : tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%cst_3 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked2> loc(#loc)
%cst_4 = arith.constant dense<4096> : tensor<16x1xi32, #blocked2> loc(#loc)
%0 = tt.get_program_id x : i32 loc(#loc)
%1 = tt.get_program_id y : i32 loc(#loc)
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc)
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%4 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%5 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%6 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%7 = arith.addi %5, %3 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%8 = arith.addi %6, %4 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%9 = arith.muli %1, %c16_i32 : i32 loc(#loc)
%10 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%12 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%13 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%14 = arith.addi %12, %10 : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%15 = arith.addi %13, %11 : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%16 = tt.expand_dims %7 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc)
%17 = tt.expand_dims %8 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> -> tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%18 = arith.muli %16, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc)
%19 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc)
%20 = tt.addptr %19, %18 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc)
%21 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc)
%22 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> loc(#loc)
%23 = tt.expand_dims %21 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x2048xi32, #blocked2> loc(#loc)
%24 = tt.expand_dims %22 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x2048xi32, #blocked1> loc(#loc)
%25 = tt.broadcast %20 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%26 = tt.broadcast %24 : tensor<1x2048xi32, #blocked1> -> tensor<4x2048xi32, #blocked1> loc(#loc)
%27 = tt.addptr %25, %26 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc)
%28 = tt.expand_dims %14 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1xi32, #blocked2> loc(#loc)
%29 = arith.muli %28, %cst_4 : tensor<16x1xi32, #blocked2> loc(#loc)
%30 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked2> loc(#loc)
%31 = tt.addptr %30, %29 : tensor<16x1x!tt.ptr<f16>, #blocked2>, tensor<16x1xi32, #blocked2> loc(#loc)
%32 = tt.broadcast %31 : tensor<16x1x!tt.ptr<f16>, #blocked2> -> tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
%33 = tt.broadcast %23 : tensor<1x2048xi32, #blocked2> -> tensor<16x2048xi32, #blocked2> loc(#loc)
%34 = tt.addptr %32, %33 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc)
%35:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %27, %arg6 = %34) -> (tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked2>) : i32 {
%44 = tt.load %arg5 : tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%45 = tt.load %arg6 : tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
%46 = tt.reshape %44 {allow_reorder = false} : tensor<4x2048xf16, #blocked1> -> tensor<4x1x2048xf16, #blocked3> loc(#loc)
%47 = tt.reshape %45 {allow_reorder = false} : tensor<16x2048xf16, #blocked2> -> tensor<1x16x2048xf16, #blocked> loc(#loc)
%48 = triton_gpu.convert_layout %46 : tensor<4x1x2048xf16, #blocked3> -> tensor<4x1x2048xf16, #blocked> loc(#loc)
%49 = arith.extf %48 : tensor<4x1x2048xf16, #blocked> to tensor<4x1x2048xf32, #blocked> loc(#loc)
%50 = tt.broadcast %49 : tensor<4x1x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc)
%51 = arith.extf %47 : tensor<1x16x2048xf16, #blocked> to tensor<1x16x2048xf32, #blocked> loc(#loc)
%52 = tt.broadcast %51 : tensor<1x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc)
%53 = arith.mulf %50, %52 : tensor<4x16x2048xf32, #blocked> loc(#loc)
%54 = "tt.reduce"(%53) <{axis = 2 : i32}> ({
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
%58 = arith.addf %arg7, %arg8 : f32 loc(#loc)
tt.reduce.return %58 : f32 loc(#loc)
}) {preserve_layout} : (tensor<4x16x2048xf32, #blocked>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%55 = arith.addf %arg4, %54 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%56 = tt.addptr %arg5, %cst_1 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc)
%57 = tt.addptr %arg6, %cst_3 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc)
scf.yield %55, %56, %57 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
} loc(#loc)
%36 = arith.muli %17, %cst_2 : tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%37 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%38 = tt.addptr %37, %36 : tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%39 = tt.expand_dims %15 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> -> tensor<1x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%40 = tt.broadcast %38 : tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%41 = tt.broadcast %39 : tensor<1x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%42 = tt.addptr %40, %41 : tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%43 = arith.truncf %35#0 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> to tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
tt.store %42, %43 : tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
tt.return loc(#loc)
} loc(#loc)
} loc(#loc)
// -----// IR Dump Before TritonGPUOptimizeDotOperands (tritongpu-optimize-dot-operands) ('builtin.module' operation) //----- //
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 16, 1], order = [2, 1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [4, 4], order = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [16, 1], order = [1, 0]}>
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [4, 1, 4], order = [2, 1, 0]}>
#loc = loc(unknown)
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} {
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} {
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%c2_i32 = arith.constant 2 : i32 loc(#loc)
%c1_i32 = arith.constant 1 : i32 loc(#loc)
%c0_i32 = arith.constant 0 : i32 loc(#loc)
%c4_i32 = arith.constant 4 : i32 loc(#loc)
%c16_i32 = arith.constant 16 : i32 loc(#loc)
%cst_0 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc)
%cst_1 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked1> loc(#loc)
%cst_2 = arith.constant dense<1024> : tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%cst_3 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked2> loc(#loc)
%cst_4 = arith.constant dense<4096> : tensor<16x1xi32, #blocked2> loc(#loc)
%0 = tt.get_program_id x : i32 loc(#loc)
%1 = tt.get_program_id y : i32 loc(#loc)
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc)
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%4 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%5 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%6 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%7 = arith.addi %5, %3 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%8 = arith.addi %6, %4 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%9 = arith.muli %1, %c16_i32 : i32 loc(#loc)
%10 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%12 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%13 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%14 = arith.addi %12, %10 : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%15 = arith.addi %13, %11 : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%16 = tt.expand_dims %7 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc)
%17 = tt.expand_dims %8 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> -> tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%18 = arith.muli %16, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc)
%19 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc)
%20 = tt.addptr %19, %18 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc)
%21 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc)
%22 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> loc(#loc)
%23 = tt.expand_dims %21 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x2048xi32, #blocked2> loc(#loc)
%24 = tt.expand_dims %22 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x2048xi32, #blocked1> loc(#loc)
%25 = tt.broadcast %20 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%26 = tt.broadcast %24 : tensor<1x2048xi32, #blocked1> -> tensor<4x2048xi32, #blocked1> loc(#loc)
%27 = tt.addptr %25, %26 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc)
%28 = tt.expand_dims %14 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1xi32, #blocked2> loc(#loc)
%29 = arith.muli %28, %cst_4 : tensor<16x1xi32, #blocked2> loc(#loc)
%30 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked2> loc(#loc)
%31 = tt.addptr %30, %29 : tensor<16x1x!tt.ptr<f16>, #blocked2>, tensor<16x1xi32, #blocked2> loc(#loc)
%32 = tt.broadcast %31 : tensor<16x1x!tt.ptr<f16>, #blocked2> -> tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
%33 = tt.broadcast %23 : tensor<1x2048xi32, #blocked2> -> tensor<16x2048xi32, #blocked2> loc(#loc)
%34 = tt.addptr %32, %33 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc)
%35:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %27, %arg6 = %34) -> (tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked2>) : i32 {
%44 = tt.load %arg5 : tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%45 = tt.load %arg6 : tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
%46 = tt.reshape %44 {allow_reorder = false} : tensor<4x2048xf16, #blocked1> -> tensor<4x1x2048xf16, #blocked3> loc(#loc)
%47 = tt.reshape %45 {allow_reorder = false} : tensor<16x2048xf16, #blocked2> -> tensor<1x16x2048xf16, #blocked> loc(#loc)
%48 = triton_gpu.convert_layout %46 : tensor<4x1x2048xf16, #blocked3> -> tensor<4x1x2048xf16, #blocked> loc(#loc)
%49 = arith.extf %48 : tensor<4x1x2048xf16, #blocked> to tensor<4x1x2048xf32, #blocked> loc(#loc)
%50 = tt.broadcast %49 : tensor<4x1x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc)
%51 = arith.extf %47 : tensor<1x16x2048xf16, #blocked> to tensor<1x16x2048xf32, #blocked> loc(#loc)
%52 = tt.broadcast %51 : tensor<1x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc)
%53 = arith.mulf %50, %52 : tensor<4x16x2048xf32, #blocked> loc(#loc)
%54 = "tt.reduce"(%53) <{axis = 2 : i32}> ({
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
%58 = arith.addf %arg7, %arg8 : f32 loc(#loc)
tt.reduce.return %58 : f32 loc(#loc)
}) {preserve_layout} : (tensor<4x16x2048xf32, #blocked>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%55 = arith.addf %arg4, %54 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%56 = tt.addptr %arg5, %cst_1 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc)
%57 = tt.addptr %arg6, %cst_3 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc)
scf.yield %55, %56, %57 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
} loc(#loc)
%36 = arith.muli %17, %cst_2 : tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%37 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%38 = tt.addptr %37, %36 : tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%39 = tt.expand_dims %15 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> -> tensor<1x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%40 = tt.broadcast %38 : tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%41 = tt.broadcast %39 : tensor<1x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%42 = tt.addptr %40, %41 : tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%43 = arith.truncf %35#0 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> to tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
tt.store %42, %43 : tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
tt.return loc(#loc)
} loc(#loc)
} loc(#loc)
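Aside: the dump above and the one below are identical, since tritongpu-optimize-dot-operands has nothing to rewrite here (the kernel carries no tt.dot). The #blocked layout attributes decide which lane owns which tensor elements: sizePerThread is the contiguous per-lane tile, threadsPerWarp spreads the 64 lanes of a gfx942 wavefront, warpsPerCTA tiles the warps, and order lists dimensions fastest-varying first. A hypothetical Python model of #blocked1 on the 4x2048 A-tile (blocked1_owner is an invented helper; this is a sketch of the intended mapping, not Triton's implementation):

# Hypothetical model of #blocked1 = {sizePerThread = [1, 8],
# threadsPerWarp = [1, 64], warpsPerCTA = [4, 4], order = [1, 0]}
# on a 4x2048 f16 tensor; returns the (row, col) owned by (warp, lane, reg).
# order = [1, 0] means dim 1 varies fastest at every level.
def blocked1_owner(warp, lane, reg):
    warp_n, warp_m = warp % 4, warp // 4   # 4x4 warp grid, dim 1 fastest
    row = warp_m                           # dim 0: 1 elem/lane, 1 lane, 4 warps
    col = (warp_n * 64 + lane) * 8 + reg   # dim 1: 8 elems/lane, 64 lanes, 4 warps
    return row, col

assert blocked1_owner(0, 0, 0) == (0, 0)
assert blocked1_owner(1, 63, 7) == (0, 1023)  # wavefront 1 ends halfway along row 0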
// -----// IR Dump Before TritonAMDGPUStreamPipelineV2 (tritonamdgpu-stream-pipeline-v2) ('builtin.module' operation) //----- //
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 16, 1], order = [2, 1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [4, 4], order = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [16, 1], order = [1, 0]}>
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [4, 1, 4], order = [2, 1, 0]}>
#loc = loc(unknown)
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} {
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} {
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%c2_i32 = arith.constant 2 : i32 loc(#loc)
%c1_i32 = arith.constant 1 : i32 loc(#loc)
%c0_i32 = arith.constant 0 : i32 loc(#loc)
%c4_i32 = arith.constant 4 : i32 loc(#loc)
%c16_i32 = arith.constant 16 : i32 loc(#loc)
%cst_0 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc)
%cst_1 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked1> loc(#loc)
%cst_2 = arith.constant dense<1024> : tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%cst_3 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked2> loc(#loc)
%cst_4 = arith.constant dense<4096> : tensor<16x1xi32, #blocked2> loc(#loc)
%0 = tt.get_program_id x : i32 loc(#loc)
%1 = tt.get_program_id y : i32 loc(#loc)
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc)
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%4 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%5 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%6 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%7 = arith.addi %5, %3 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%8 = arith.addi %6, %4 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%9 = arith.muli %1, %c16_i32 : i32 loc(#loc)
%10 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%12 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%13 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%14 = arith.addi %12, %10 : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%15 = arith.addi %13, %11 : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%16 = tt.expand_dims %7 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc)
%17 = tt.expand_dims %8 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> -> tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%18 = arith.muli %16, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc)
%19 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc)
%20 = tt.addptr %19, %18 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc)
%21 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc)
%22 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> loc(#loc)
%23 = tt.expand_dims %21 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x2048xi32, #blocked2> loc(#loc)
%24 = tt.expand_dims %22 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x2048xi32, #blocked1> loc(#loc)
%25 = tt.broadcast %20 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%26 = tt.broadcast %24 : tensor<1x2048xi32, #blocked1> -> tensor<4x2048xi32, #blocked1> loc(#loc)
%27 = tt.addptr %25, %26 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc)
%28 = tt.expand_dims %14 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1xi32, #blocked2> loc(#loc)
%29 = arith.muli %28, %cst_4 : tensor<16x1xi32, #blocked2> loc(#loc)
%30 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked2> loc(#loc)
%31 = tt.addptr %30, %29 : tensor<16x1x!tt.ptr<f16>, #blocked2>, tensor<16x1xi32, #blocked2> loc(#loc)
%32 = tt.broadcast %31 : tensor<16x1x!tt.ptr<f16>, #blocked2> -> tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
%33 = tt.broadcast %23 : tensor<1x2048xi32, #blocked2> -> tensor<16x2048xi32, #blocked2> loc(#loc)
%34 = tt.addptr %32, %33 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc)
%35:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %27, %arg6 = %34) -> (tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked2>) : i32 {
%44 = tt.load %arg5 : tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%45 = tt.load %arg6 : tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
%46 = tt.reshape %44 {allow_reorder = false} : tensor<4x2048xf16, #blocked1> -> tensor<4x1x2048xf16, #blocked3> loc(#loc)
%47 = tt.reshape %45 {allow_reorder = false} : tensor<16x2048xf16, #blocked2> -> tensor<1x16x2048xf16, #blocked> loc(#loc)
%48 = triton_gpu.convert_layout %46 : tensor<4x1x2048xf16, #blocked3> -> tensor<4x1x2048xf16, #blocked> loc(#loc)
%49 = arith.extf %48 : tensor<4x1x2048xf16, #blocked> to tensor<4x1x2048xf32, #blocked> loc(#loc)
%50 = tt.broadcast %49 : tensor<4x1x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc)
%51 = arith.extf %47 : tensor<1x16x2048xf16, #blocked> to tensor<1x16x2048xf32, #blocked> loc(#loc)
%52 = tt.broadcast %51 : tensor<1x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc)
%53 = arith.mulf %50, %52 : tensor<4x16x2048xf32, #blocked> loc(#loc)
%54 = "tt.reduce"(%53) <{axis = 2 : i32}> ({
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
%58 = arith.addf %arg7, %arg8 : f32 loc(#loc)
tt.reduce.return %58 : f32 loc(#loc)
}) {preserve_layout} : (tensor<4x16x2048xf32, #blocked>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%55 = arith.addf %arg4, %54 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%56 = tt.addptr %arg5, %cst_1 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc)
%57 = tt.addptr %arg6, %cst_3 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc)
scf.yield %55, %56, %57 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
} loc(#loc)
%36 = arith.muli %17, %cst_2 : tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%37 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%38 = tt.addptr %37, %36 : tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%39 = tt.expand_dims %15 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> -> tensor<1x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%40 = tt.broadcast %38 : tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%41 = tt.broadcast %39 : tensor<1x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%42 = tt.addptr %40, %41 : tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%43 = arith.truncf %35#0 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> to tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
tt.store %42, %43 : tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
tt.return loc(#loc)
} loc(#loc)
} loc(#loc)
// -----// IR Dump Before Canonicalizer (canonicalize) ('builtin.module' operation) //----- //
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 16, 1], order = [2, 1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [4, 4], order = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [16, 1], order = [1, 0]}>
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [4, 1, 4], order = [2, 1, 0]}>
#loc = loc(unknown)
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} {
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} {
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%c2_i32 = arith.constant 2 : i32 loc(#loc)
%c1_i32 = arith.constant 1 : i32 loc(#loc)
%c0_i32 = arith.constant 0 : i32 loc(#loc)
%c4_i32 = arith.constant 4 : i32 loc(#loc)
%c16_i32 = arith.constant 16 : i32 loc(#loc)
%cst_0 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc)
%cst_1 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked1> loc(#loc)
%cst_2 = arith.constant dense<1024> : tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%cst_3 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked2> loc(#loc)
%cst_4 = arith.constant dense<4096> : tensor<16x1xi32, #blocked2> loc(#loc)
%0 = tt.get_program_id x : i32 loc(#loc)
%1 = tt.get_program_id y : i32 loc(#loc)
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc)
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%4 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%5 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%6 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%7 = arith.addi %5, %3 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc)
%8 = arith.addi %6, %4 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%9 = arith.muli %1, %c16_i32 : i32 loc(#loc)
%10 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%12 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%13 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%14 = arith.addi %12, %10 : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc)
%15 = arith.addi %13, %11 : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc)
%16 = tt.expand_dims %7 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc)
%17 = tt.expand_dims %8 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> -> tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%18 = arith.muli %16, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc)
%19 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc)
%20 = tt.addptr %19, %18 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc)
%21 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc)
%22 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> loc(#loc)
%23 = tt.expand_dims %21 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x2048xi32, #blocked2> loc(#loc)
%24 = tt.expand_dims %22 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x2048xi32, #blocked1> loc(#loc)
%25 = tt.broadcast %20 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%26 = tt.broadcast %24 : tensor<1x2048xi32, #blocked1> -> tensor<4x2048xi32, #blocked1> loc(#loc)
%27 = tt.addptr %25, %26 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc)
%28 = tt.expand_dims %14 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1xi32, #blocked2> loc(#loc)
%29 = arith.muli %28, %cst_4 : tensor<16x1xi32, #blocked2> loc(#loc)
%30 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked2> loc(#loc)
%31 = tt.addptr %30, %29 : tensor<16x1x!tt.ptr<f16>, #blocked2>, tensor<16x1xi32, #blocked2> loc(#loc)
%32 = tt.broadcast %31 : tensor<16x1x!tt.ptr<f16>, #blocked2> -> tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
%33 = tt.broadcast %23 : tensor<1x2048xi32, #blocked2> -> tensor<16x2048xi32, #blocked2> loc(#loc)
%34 = tt.addptr %32, %33 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc)
%c-1_i32 = arith.constant -1 : i32 loc(#loc)
%c0_i32_5 = arith.constant 0 : i32 loc(#loc)
%c1_i32_6 = arith.constant 1 : i32 loc(#loc)
%c1_i32_7 = arith.constant 1 : i32 loc(#loc)
%c0_i32_8 = arith.constant 0 : i32 loc(#loc)
%35 = arith.muli %c1_i32, %c0_i32_8 : i32 loc(#loc)
%36 = arith.addi %c0_i32, %35 : i32 loc(#loc)
%37 = arith.cmpi slt, %36, %c2_i32 : i32 loc(#loc)
%c0_i32_9 = arith.constant 0 : i32 loc(#loc)
%38 = arith.muli %c1_i32, %c0_i32_9 : i32 loc(#loc)
%39 = arith.addi %c0_i32, %38 : i32 loc(#loc)
%40 = tt.splat %37 : i1 -> tensor<4x2048xi1, #blocked1> loc(#loc)
%41 = tt.load %27, %40 : tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%42 = tt.splat %37 : i1 -> tensor<16x2048xi1, #blocked2> loc(#loc)
%43 = tt.load %34, %42 : tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
%44:6 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %27, %arg6 = %34, %arg7 = %c-1_i32, %arg8 = %41, %arg9 = %43) -> (tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked2>, i32, tensor<4x2048xf16, #blocked1>, tensor<16x2048xf16, #blocked2>) : i32 {
%c1_i32_10 = arith.constant 1 : i32 loc(#loc)
%53 = arith.muli %c1_i32, %c1_i32_10 : i32 loc(#loc)
%54 = arith.subi %c2_i32, %53 : i32 loc(#loc)
%55 = arith.cmpi slt, %arg3, %54 : i32 loc(#loc)
%56 = arith.addi %arg7, %c1_i32_6 : i32 loc(#loc)
%57 = arith.cmpi slt, %56, %c1_i32_7 : i32 loc(#loc)
%58 = arith.select %57, %56, %c0_i32_5 : i32 loc(#loc)
%59 = tt.reshape %arg8 {allow_reorder = false} : tensor<4x2048xf16, #blocked1> -> tensor<4x1x2048xf16, #blocked3> loc(#loc)
%60 = tt.reshape %arg9 {allow_reorder = false} : tensor<16x2048xf16, #blocked2> -> tensor<1x16x2048xf16, #blocked> loc(#loc)
%61 = triton_gpu.convert_layout %59 : tensor<4x1x2048xf16, #blocked3> -> tensor<4x1x2048xf16, #blocked> loc(#loc)
%62 = arith.extf %61 : tensor<4x1x2048xf16, #blocked> to tensor<4x1x2048xf32, #blocked> loc(#loc)
%63 = tt.broadcast %62 : tensor<4x1x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc)
%64 = arith.extf %60 : tensor<1x16x2048xf16, #blocked> to tensor<1x16x2048xf32, #blocked> loc(#loc)
%65 = tt.broadcast %64 : tensor<1x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc)
%66 = arith.mulf %63, %65 : tensor<4x16x2048xf32, #blocked> loc(#loc)
%67 = "tt.reduce"(%66) <{axis = 2 : i32}> ({
^bb0(%arg10: f32 loc(unknown), %arg11: f32 loc(unknown)):
%75 = arith.addf %arg10, %arg11 : f32 loc(#loc)
tt.reduce.return %75 : f32 loc(#loc)
}) {preserve_layout} : (tensor<4x16x2048xf32, #blocked>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%68 = arith.addf %arg4, %67 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%69 = tt.addptr %arg5, %cst_1 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc)
%70 = tt.addptr %arg6, %cst_3 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc)
%71 = tt.splat %55 : i1 -> tensor<4x2048xi1, #blocked1> loc(#loc)
%72 = tt.load %69, %71 : tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc)
%73 = tt.splat %55 : i1 -> tensor<16x2048xi1, #blocked2> loc(#loc)
%74 = tt.load %70, %73 : tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc)
scf.yield %68, %69, %70, %58, %72, %74 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked2>, i32, tensor<4x2048xf16, #blocked1>, tensor<16x2048xf16, #blocked2> loc(#loc)
} loc(#loc)
%45 = arith.muli %17, %cst_2 : tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%46 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%47 = tt.addptr %46, %45 : tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%48 = tt.expand_dims %15 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> -> tensor<1x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%49 = tt.broadcast %47 : tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%50 = tt.broadcast %48 : tensor<1x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%51 = tt.addptr %49, %50 : tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
%52 = arith.truncf %44#0 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> to tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
tt.store %51, %52 : tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc)
tt.return loc(#loc)
} loc(#loc)
} loc(#loc)
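The canonicalized IR above shows what tritonamdgpu-stream-pipeline-v2 did to the reduction loop: the two tt.loads were peeled into a prologue (%41, %43), the fetched tiles travel through the loop as extra iter_args, and each iteration computes on the previous trip's tiles while issuing the next loads, masked off on the final trip by the %55 = arith.cmpi slt predicate. A schematic sketch of the rewrite (not the actual pass; load_a, load_b, and compute are stand-ins for the tt.loads and the broadcast-multiply-reduce body):

# Schematic model of the stream-pipelining rewrite: compute on the tiles
# fetched last iteration while prefetching the next ones, so global-memory
# latency overlaps with the FMAs.
def pipelined(load_a, load_b, compute, num_iters):
    acc = 0.0
    a_tile, b_tile = load_a(0), load_b(0)        # prologue loads (%41, %43)
    for i in range(num_iters):
        prefetch = i < num_iters - 1             # last-trip mask (%55)
        nxt_a = load_a(i + 1) if prefetch else None
        nxt_b = load_b(i + 1) if prefetch else None
        acc += compute(a_tile, b_tile)           # overlaps with the loads above
        a_tile, b_tile = nxt_a, nxt_b            # carried via scf.yield
    return acc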
.text
.amdgcn_target "amdgcn-amd-amdhsa--gfx942"
.amdhsa_code_object_version 4
.globl matvec
.p2align 8
.type matvec,@function
matvec:
s_load_dwordx4 s[8:11], s[0:1], 0x0
s_load_dwordx2 s[12:13], s[0:1], 0x10
v_lshrrev_b32_e32 v1, 6, v0
v_lshl_or_b32 v40, s3, 4, v1
v_lshlrev_b32_e32 v4, 12, v40
v_lshlrev_b32_e32 v1, 3, v0
s_waitcnt lgkmcnt(0)
v_mov_b32_e32 v2, s10
v_mov_b32_e32 v3, s11
v_ashrrev_i32_e32 v5, 31, v4
v_and_b32_e32 v6, 0x1f8, v1
v_lshl_add_u64 v[2:3], v[4:5], 1, v[2:3]
v_lshlrev_b32_e32 v42, 1, v6
v_mov_b32_e32 v43, 0
v_lshlrev_b32_e32 v7, 4, v0
v_lshl_add_u64 v[44:45], v[2:3], 0, v[42:43]
v_and_b32_e32 v2, 0x3000, v7
v_lshl_or_b32 v2, s2, 14, v2
v_ashrrev_i32_e32 v3, 31, v2
v_lshlrev_b64 v[2:3], 1, v[2:3]
global_load_dwordx4 v[32:35], v[44:45], off
global_load_dwordx4 v[28:31], v[44:45], off offset:1024
global_load_dwordx4 v[24:27], v[44:45], off offset:2048
global_load_dwordx4 v[16:19], v[44:45], off offset:3072
v_lshl_add_u64 v[4:5], s[8:9], 0, v[2:3]
v_and_b32_e32 v42, 0xff0, v7
v_lshl_add_u64 v[4:5], v[4:5], 0, v[42:43]
global_load_dwordx4 v[36:39], v[4:5], off
v_lshrrev_b32_e32 v4, 5, v0
v_and_b32_e32 v4, 24, v4
v_add_u32_e32 v1, v4, v1
v_lshl_add_u32 v41, v1, 2, 0
v_lshlrev_b32_e32 v1, 2, v0
v_and_b32_e32 v0, 0xff, v0
v_lshl_or_b32 v2, v0, 4, v2
v_xor_b32_e32 v51, 0x80, v1
v_lshl_add_u64 v[0:1], v[2:3], 0, s[8:9]
s_mov_b64 s[0:1], 0x1000
s_mov_b32 s4, 0
v_lshl_add_u32 v50, v6, 2, 0
v_lshl_add_u64 v[48:49], v[0:1], 0, s[0:1]
s_mov_b64 s[14:15], -1
s_mov_b64 s[8:9], 0
v_mov_b32_e32 v42, v43
v_mov_b32_e32 v46, v43
v_mov_b32_e32 v47, v43
s_branch .LBB0_3
.LBB0_1:
v_lshl_add_u64 v[20:21], v[48:49], 0, s[8:9]
global_load_dwordx4 v[20:23], v[20:21], off
.LBB0_2:
s_waitcnt vmcnt(0)
v_cvt_f32_f16_sdwa v59, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
v_cvt_f32_f16_sdwa v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
v_cvt_f32_f16_e32 v58, v39
v_cvt_f32_f16_e32 v56, v38
v_cvt_f32_f16_sdwa v55, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
v_cvt_f32_f16_sdwa v53, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
v_cvt_f32_f16_e32 v54, v37
v_cvt_f32_f16_e32 v52, v36
s_barrier
ds_write_b128 v41, v[56:59] offset:16
ds_write_b128 v41, v[52:55]
s_waitcnt lgkmcnt(0)
s_barrier
ds_read_b128 v[36:39], v50
ds_read_b128 v[52:55], v50 offset:16
v_cvt_f32_f16_sdwa v56, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
s_add_u32 s8, s8, 0x1000
s_addc_u32 s9, s9, 0
s_mov_b64 s[14:15], 0
s_waitcnt lgkmcnt(1)
v_mul_f32_e32 v37, v37, v56
v_fma_mix_f32 v36, v36, v32, v37 op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v38, v33, v36 op_sel_hi:[0,1,0]
v_fma_mix_f32 v57, v39, v33, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[36:39], v50 offset:8224
s_waitcnt lgkmcnt(1)
v_fma_mix_f32 v52, v52, v34, v57 op_sel_hi:[0,1,0]
v_fma_mix_f32 v52, v53, v34, v52 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v52, v54, v35, v52 op_sel_hi:[0,1,0]
v_fma_mix_f32 v52, v55, v35, v52 op_sel:[0,1,0] op_sel_hi:[0,1,0]
s_waitcnt lgkmcnt(0)
v_mul_f32_e32 v37, v37, v56
v_fma_mix_f32 v36, v36, v32, v37 op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v38, v33, v36 op_sel_hi:[0,1,0]
v_fma_mix_f32 v53, v39, v33, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[36:39], v50 offset:8240
v_cvt_f32_f16_e32 v57, v18
s_andn2_b64 vcc, exec, s[10:11]
s_waitcnt lgkmcnt(0)
v_fma_mix_f32 v36, v36, v34, v53 op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v37, v34, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v38, v35, v36 op_sel_hi:[0,1,0]
v_fma_mix_f32 v53, v39, v35, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[36:39], v50 offset:16448
s_waitcnt lgkmcnt(0)
v_mul_f32_e32 v37, v37, v56
v_fma_mix_f32 v36, v36, v32, v37 op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v38, v33, v36 op_sel_hi:[0,1,0]
v_fma_mix_f32 v54, v39, v33, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[36:39], v50 offset:16464
s_waitcnt lgkmcnt(0)
v_fma_mix_f32 v36, v36, v34, v54 op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v37, v34, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v38, v35, v36 op_sel_hi:[0,1,0]
v_fma_mix_f32 v54, v39, v35, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[36:39], v50 offset:24672
s_waitcnt lgkmcnt(0)
v_mul_f32_e32 v37, v37, v56
v_fma_mix_f32 v32, v36, v32, v37 op_sel_hi:[0,1,0]
v_fma_mix_f32 v32, v38, v33, v32 op_sel_hi:[0,1,0]
v_fma_mix_f32 v32, v39, v33, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[36:39], v50 offset:24688
v_cvt_f32_f16_sdwa v56, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
s_waitcnt lgkmcnt(0)
v_fma_mix_f32 v32, v36, v34, v32 op_sel_hi:[0,1,0]
v_fma_mix_f32 v32, v37, v34, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v32, v38, v35, v32 op_sel_hi:[0,1,0]
v_fma_mix_f32 v55, v39, v35, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[32:35], v50 offset:2048
ds_read_b128 v[36:39], v50 offset:2064
s_waitcnt lgkmcnt(1)
v_fma_mix_f32 v32, v32, v28, v52 op_sel_hi:[0,1,0]
v_fma_mix_f32 v32, v33, v28, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v32, v34, v29, v32 op_sel_hi:[0,1,0]
v_fma_mix_f32 v52, v35, v29, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[32:35], v50 offset:4096
s_waitcnt lgkmcnt(1)
v_fma_mix_f32 v36, v36, v30, v52 op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v37, v30, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v38, v31, v36 op_sel_hi:[0,1,0]
v_fma_mix_f32 v52, v39, v31, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[36:39], v50 offset:4112
s_waitcnt lgkmcnt(1)
v_fma_mix_f32 v32, v32, v24, v52 op_sel_hi:[0,1,0]
v_fma_mix_f32 v32, v33, v24, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v32, v34, v25, v32 op_sel_hi:[0,1,0]
v_fma_mix_f32 v32, v35, v25, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
s_waitcnt lgkmcnt(0)
v_fma_mix_f32 v36, v36, v26, v32 op_sel_hi:[0,1,0]
ds_read_b128 v[32:35], v50 offset:10272
v_fma_mix_f32 v36, v37, v26, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v38, v27, v36 op_sel_hi:[0,1,0]
v_fma_mix_f32 v52, v39, v27, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[36:39], v50 offset:10288
s_waitcnt lgkmcnt(1)
v_fma_mix_f32 v32, v32, v28, v53 op_sel_hi:[0,1,0]
v_fma_mix_f32 v32, v33, v28, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v32, v34, v29, v32 op_sel_hi:[0,1,0]
v_fma_mix_f32 v32, v35, v29, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
s_waitcnt lgkmcnt(0)
v_fma_mix_f32 v36, v36, v30, v32 op_sel_hi:[0,1,0]
ds_read_b128 v[32:35], v50 offset:18496
v_fma_mix_f32 v36, v37, v30, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v38, v31, v36 op_sel_hi:[0,1,0]
v_fma_mix_f32 v53, v39, v31, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[36:39], v50 offset:18512
s_waitcnt lgkmcnt(1)
v_fma_mix_f32 v32, v32, v28, v54 op_sel_hi:[0,1,0]
v_fma_mix_f32 v32, v33, v28, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v32, v34, v29, v32 op_sel_hi:[0,1,0]
v_fma_mix_f32 v32, v35, v29, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
s_waitcnt lgkmcnt(0)
v_fma_mix_f32 v36, v36, v30, v32 op_sel_hi:[0,1,0]
ds_read_b128 v[32:35], v50 offset:26720
v_fma_mix_f32 v36, v37, v30, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v38, v31, v36 op_sel_hi:[0,1,0]
v_fma_mix_f32 v54, v39, v31, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[36:39], v50 offset:26736
s_waitcnt lgkmcnt(1)
v_fma_mix_f32 v32, v32, v28, v55 op_sel_hi:[0,1,0]
v_fma_mix_f32 v28, v33, v28, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v28, v34, v29, v28 op_sel_hi:[0,1,0]
v_fma_mix_f32 v28, v35, v29, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[32:35], v50 offset:12320
s_waitcnt lgkmcnt(1)
v_fma_mix_f32 v28, v36, v30, v28 op_sel_hi:[0,1,0]
v_fma_mix_f32 v28, v37, v30, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v28, v38, v31, v28 op_sel_hi:[0,1,0]
v_fma_mix_f32 v55, v39, v31, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[28:31], v50 offset:12336
ds_read_b128 v[36:39], v50 offset:6144
s_waitcnt lgkmcnt(2)
v_fma_mix_f32 v32, v32, v24, v53 op_sel_hi:[0,1,0]
v_fma_mix_f32 v32, v33, v24, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v32, v34, v25, v32 op_sel_hi:[0,1,0]
v_fma_mix_f32 v53, v35, v25, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[32:35], v50 offset:6160
s_waitcnt lgkmcnt(1)
v_fma_mix_f32 v36, v36, v16, v52 op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v37, v16, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v38, v17, v36 op_sel_hi:[0,1,0]
v_fma_mix_f32 v52, v39, v17, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[36:39], v50 offset:20544
v_fma_mix_f32 v28, v28, v26, v53 op_sel_hi:[0,1,0]
v_fma_mix_f32 v28, v29, v26, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v28, v30, v27, v28 op_sel_hi:[0,1,0]
v_fma_mix_f32 v53, v31, v27, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[28:31], v50 offset:20560
s_waitcnt lgkmcnt(1)
v_fma_mix_f32 v36, v36, v24, v54 op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v37, v24, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v38, v25, v36 op_sel_hi:[0,1,0]
v_fma_mix_f32 v54, v39, v25, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[36:39], v50 offset:28768
s_waitcnt lgkmcnt(1)
v_fma_mix_f32 v28, v28, v26, v54 op_sel_hi:[0,1,0]
v_fma_mix_f32 v28, v29, v26, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v28, v30, v27, v28 op_sel_hi:[0,1,0]
v_fma_mix_f32 v54, v31, v27, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[28:31], v50 offset:28784
s_waitcnt lgkmcnt(1)
v_fma_mix_f32 v36, v36, v24, v55 op_sel_hi:[0,1,0]
v_fma_mix_f32 v24, v37, v24, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v24, v38, v25, v24 op_sel_hi:[0,1,0]
v_fma_mix_f32 v24, v39, v25, v24 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[36:39], v50 offset:14368
s_waitcnt lgkmcnt(1)
v_fma_mix_f32 v24, v28, v26, v24 op_sel_hi:[0,1,0]
v_fma_mix_f32 v24, v29, v26, v24 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v24, v30, v27, v24 op_sel_hi:[0,1,0]
v_fma_mix_f32 v55, v31, v27, v24 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[24:27], v50 offset:14384
s_waitcnt lgkmcnt(1)
v_fma_mix_f32 v36, v36, v16, v53 op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v37, v16, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v36, v38, v17, v36 op_sel_hi:[0,1,0]
ds_read_b128 v[28:31], v50 offset:22592
v_fma_mix_f32 v53, v39, v17, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
ds_read_b128 v[36:39], v50 offset:30816
s_waitcnt lgkmcnt(1)
v_fma_mix_f32 v28, v28, v16, v54 op_sel_hi:[0,1,0]
v_fma_mix_f32 v28, v29, v16, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0]
s_waitcnt lgkmcnt(0)
v_fma_mix_f32 v36, v36, v16, v55 op_sel_hi:[0,1,0]
v_fma_mix_f32 v16, v37, v16, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v28, v30, v17, v28 op_sel_hi:[0,1,0]
v_fma_mix_f32 v16, v38, v17, v16 op_sel_hi:[0,1,0]
v_fma_mix_f32 v54, v31, v17, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_fma_mix_f32 v55, v39, v17, v16 op_sel:[0,1,0] op_sel_hi:[0,1,0]
v_mul_f32_e32 v16, v32, v57
v_mul_f32_e32 v17, v24, v57
ds_read_b128 v[28:31], v50 offset:22608
v_cvt_f32_f16_sdwa v36, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
v_cvt_f32_f16_e32 v38, v19
v_pk_add_f32 v[52:53], v[16:17], v[52:53]
ds_read_b128 v[16:19], v50 offset:30832
s_waitcnt lgkmcnt(1)
v_mul_f32_e32 v58, v28, v57
v_mov_b32_e32 v24, v33
v_pk_fma_f32 v[24:25], v[24:25], v[36:37], v[52:53] op_sel_hi:[1,0,1]
v_mov_b32_e32 v28, v34
s_waitcnt lgkmcnt(0)
v_mul_f32_e32 v59, v16, v57
v_pk_add_f32 v[54:55], v[58:59], v[54:55]
v_mov_b32_e32 v16, v29
v_mov_b32_e32 v29, v26
v_pk_fma_f32 v[16:17], v[16:17], v[36:37], v[54:55] op_sel_hi:[1,0,1]
v_pk_fma_f32 v[24:25], v[28:29], v[38:39], v[24:25] op_sel_hi:[1,0,1]
v_mov_b32_e32 v28, v30
v_mov_b32_e32 v29, v18
v_pk_fma_f32 v[16:17], v[28:29], v[38:39], v[16:17] op_sel_hi:[1,0,1]
v_mov_b32_e32 v18, v31
v_mov_b32_e32 v26, v35
v_pk_fma_f32 v[16:17], v[18:19], v[56:57], v[16:17] op_sel_hi:[1,0,1]
v_pk_fma_f32 v[18:19], v[26:27], v[56:57], v[24:25] op_sel_hi:[1,0,1]
ds_bpermute_b32 v24, v51, v18
ds_bpermute_b32 v26, v51, v16
ds_bpermute_b32 v27, v51, v17
ds_bpermute_b32 v25, v51, v19
v_mov_b64_e32 v[38:39], v[22:23]
v_mov_b64_e32 v[34:35], v[2:3]
v_mov_b64_e32 v[30:31], v[6:7]
s_waitcnt lgkmcnt(1)
v_pk_add_f32 v[16:17], v[16:17], v[26:27]
s_waitcnt lgkmcnt(0)
v_pk_add_f32 v[18:19], v[18:19], v[24:25]
ds_swizzle_b32 v24, v18 offset:swizzle(SWAP,16)
ds_swizzle_b32 v26, v16 offset:swizzle(SWAP,16)
ds_swizzle_b32 v27, v17 offset:swizzle(SWAP,16)
ds_swizzle_b32 v25, v19 offset:swizzle(SWAP,16)
v_mov_b64_e32 v[36:37], v[20:21]
v_mov_b64_e32 v[32:33], v[0:1]
v_mov_b64_e32 v[28:29], v[4:5]
s_waitcnt lgkmcnt(1)
v_pk_add_f32 v[16:17], v[16:17], v[26:27]
s_waitcnt lgkmcnt(0)
v_pk_add_f32 v[18:19], v[18:19], v[24:25]
ds_swizzle_b32 v24, v18 offset:swizzle(SWAP,8)
ds_swizzle_b32 v26, v16 offset:swizzle(SWAP,8)
ds_swizzle_b32 v27, v17 offset:swizzle(SWAP,8)
ds_swizzle_b32 v25, v19 offset:swizzle(SWAP,8)
s_waitcnt lgkmcnt(1)
v_pk_add_f32 v[16:17], v[16:17], v[26:27]
s_waitcnt lgkmcnt(0)
v_pk_add_f32 v[18:19], v[18:19], v[24:25]
ds_swizzle_b32 v24, v18 offset:swizzle(SWAP,4)
ds_swizzle_b32 v26, v16 offset:swizzle(SWAP,4)
ds_swizzle_b32 v27, v17 offset:swizzle(SWAP,4)
ds_swizzle_b32 v25, v19 offset:swizzle(SWAP,4)
s_waitcnt lgkmcnt(1)
v_pk_add_f32 v[16:17], v[16:17], v[26:27]
s_waitcnt lgkmcnt(0)
v_pk_add_f32 v[18:19], v[18:19], v[24:25]
ds_swizzle_b32 v24, v18 offset:swizzle(SWAP,2)
ds_swizzle_b32 v26, v16 offset:swizzle(SWAP,2)
ds_swizzle_b32 v27, v17 offset:swizzle(SWAP,2)
ds_swizzle_b32 v25, v19 offset:swizzle(SWAP,2)
s_waitcnt lgkmcnt(1)
v_pk_add_f32 v[16:17], v[16:17], v[26:27]
s_waitcnt lgkmcnt(0)
v_pk_add_f32 v[18:19], v[18:19], v[24:25]
ds_swizzle_b32 v24, v18 offset:swizzle(SWAP,1)
ds_swizzle_b32 v25, v19 offset:swizzle(SWAP,1)
ds_swizzle_b32 v26, v16 offset:swizzle(SWAP,1)
ds_swizzle_b32 v27, v17 offset:swizzle(SWAP,1)
s_waitcnt lgkmcnt(2)
v_pk_add_f32 v[18:19], v[18:19], v[24:25]
s_nop 0
v_pk_add_f32 v[42:43], v[42:43], v[18:19]
s_waitcnt lgkmcnt(0)
v_pk_add_f32 v[16:17], v[16:17], v[26:27]
v_mov_b64_e32 v[26:27], v[10:11]
v_pk_add_f32 v[46:47], v[46:47], v[16:17]
v_mov_b64_e32 v[18:19], v[14:15]
v_mov_b64_e32 v[24:25], v[8:9]
v_mov_b64_e32 v[16:17], v[12:13]
s_cbranch_vccz .LBB0_14
.LBB0_3:
s_xor_b64 s[10:11], s[14:15], -1
v_lshl_add_u64 v[12:13], v[44:45], 0, s[8:9]
s_mov_b32 s5, s4
s_mov_b32 s6, s4
s_mov_b32 s7, s4
s_and_b64 vcc, exec, s[10:11]
s_cbranch_vccnz .LBB0_6
v_add_co_u32_e32 v0, vcc, 0x1000, v12
s_nop 1
v_addc_co_u32_e32 v1, vcc, 0, v13, vcc
global_load_dwordx4 v[0:3], v[0:1], off
v_cndmask_b32_e64 v4, 0, 1, s[14:15]
v_cmp_ne_u32_e64 s[0:1], 1, v4
s_andn2_b64 vcc, exec, s[14:15]
s_cbranch_vccz .LBB0_7
.LBB0_5:
v_mov_b64_e32 v[4:5], s[4:5]
v_mov_b64_e32 v[6:7], s[6:7]
s_branch .LBB0_8
.LBB0_6:
v_mov_b64_e32 v[0:1], s[4:5]
v_mov_b64_e32 v[2:3], s[6:7]
v_cndmask_b32_e64 v4, 0, 1, s[14:15]
v_cmp_ne_u32_e64 s[0:1], 1, v4
s_andn2_b64 vcc, exec, s[14:15]
s_cbranch_vccnz .LBB0_5
.LBB0_7:
v_add_co_u32_e32 v4, vcc, 0x1000, v12
s_nop 1
v_addc_co_u32_e32 v5, vcc, 0, v13, vcc
global_load_dwordx4 v[4:7], v[4:5], off offset:1024
.LBB0_8:
s_mov_b32 s5, s4
s_mov_b32 s6, s4
s_and_b64 vcc, exec, s[0:1]
s_mov_b32 s7, s4
s_cbranch_vccnz .LBB0_11
v_add_co_u32_e32 v8, vcc, 0x1000, v12
s_nop 1
v_addc_co_u32_e32 v9, vcc, 0, v13, vcc
global_load_dwordx4 v[8:11], v[8:9], off offset:2048
s_and_b64 vcc, exec, s[0:1]
s_cbranch_vccz .LBB0_12
.LBB0_10:
v_mov_b64_e32 v[14:15], s[6:7]
v_mov_b64_e32 v[12:13], s[4:5]
s_and_b64 vcc, exec, s[0:1]
s_cbranch_vccz .LBB0_1
s_branch .LBB0_13
.LBB0_11:
v_mov_b64_e32 v[10:11], s[6:7]
v_mov_b64_e32 v[8:9], s[4:5]
s_and_b64 vcc, exec, s[0:1]
s_cbranch_vccnz .LBB0_10
.LBB0_12:
v_add_co_u32_e32 v12, vcc, 0x1000, v12
s_nop 1
v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
global_load_dwordx4 v[12:15], v[12:13], off offset:3072
s_and_b64 vcc, exec, s[0:1]
s_cbranch_vccz .LBB0_1
.LBB0_13:
s_mov_b32 s6, s4
s_mov_b32 s7, s4
s_mov_b32 s5, s4
v_mov_b64_e32 v[22:23], s[6:7]
v_mov_b64_e32 v[20:21], s[4:5]
s_branch .LBB0_2
.LBB0_14:
s_lshl_b32 s0, s2, 12
s_ashr_i32 s1, s0, 31
s_or_b32 s2, s0, 0x800
s_or_b32 s4, s0, 0xc00
s_lshl_b64 s[0:1], s[0:1], 1
s_add_u32 s0, s12, s0
s_addc_u32 s1, s13, s1
s_ashr_i32 s3, s2, 31
s_lshl_b64 s[2:3], s[2:3], 1
s_add_u32 s2, s12, s2
s_addc_u32 s3, s13, s3
s_ashr_i32 s5, s4, 31
v_cvt_f16_f32_e32 v6, v42
s_lshl_b64 s[4:5], s[4:5], 1
v_ashrrev_i32_e32 v41, 31, v40
v_cvt_f16_f32_e32 v7, v43
s_add_u32 s4, s12, s4
v_lshlrev_b64 v[0:1], 1, v[40:41]
v_cvt_f16_f32_e32 v8, v46
s_addc_u32 s5, s13, s5
v_lshl_add_u64 v[2:3], s[0:1], 0, v[0:1]
v_cvt_f16_f32_e32 v9, v47
v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
v_lshl_add_u64 v[0:1], s[4:5], 0, v[0:1]
global_store_short v[2:3], v6, off
global_store_short v[2:3], v7, off offset:2048
global_store_short v[4:5], v8, off
global_store_short v[0:1], v9, off
s_endpgm
.section .rodata,"a",@progbits
.p2align 6, 0x0
.amdhsa_kernel matvec
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 24
.amdhsa_user_sgpr_count 2
.amdhsa_user_sgpr_dispatch_ptr 0
.amdhsa_user_sgpr_queue_ptr 0
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_dispatch_id 0
.amdhsa_user_sgpr_kernarg_preload_length 0
.amdhsa_user_sgpr_kernarg_preload_offset 0
.amdhsa_user_sgpr_private_segment_size 0
.amdhsa_enable_private_segment 0
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 1
.amdhsa_system_sgpr_workgroup_id_z 0
.amdhsa_system_sgpr_workgroup_info 0
.amdhsa_system_vgpr_workitem_id 0
.amdhsa_next_free_vgpr 60
.amdhsa_next_free_sgpr 16
.amdhsa_accum_offset 60
.amdhsa_reserve_vcc 1
.amdhsa_reserve_xnack_mask 1
.amdhsa_float_round_mode_32 0
.amdhsa_float_round_mode_16_64 0
.amdhsa_float_denorm_mode_32 3
.amdhsa_float_denorm_mode_16_64 3
.amdhsa_dx10_clamp 1
.amdhsa_ieee_mode 1
.amdhsa_fp16_overflow 0
.amdhsa_tg_split 0
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
.amdhsa_exception_fp_ieee_overflow 0
.amdhsa_exception_fp_ieee_underflow 0
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.Lfunc_end0:
.size matvec, .Lfunc_end0-matvec
.p2alignl 6, 3212836864
.fill 256, 4, 3212836864
.section ".note.GNU-stack","",@progbits
.amdgpu_metadata
---
amdhsa.kernels:
- .agpr_count: 0
.args:
- .address_space: global
.offset: 0
.size: 8
.value_kind: global_buffer
- .address_space: global
.offset: 8
.size: 8
.value_kind: global_buffer
- .address_space: global
.offset: 16
.size: 8
.value_kind: global_buffer
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 24
.max_flat_workgroup_size: 1024
.name: matvec
.private_segment_fixed_size: 0
.sgpr_count: 22
.sgpr_spill_count: 0
.symbol: matvec.kd
.vgpr_count: 60
.vgpr_spill_count: 0
.wavefront_size: 64
amdhsa.target: amdgcn-amd-amdhsa--gfx942
amdhsa.version:
- 1
- 1
...
.end_amdgpu_metadata
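In the final ISA, the tt.reduce over K lowers to an XOR butterfly across the 64-lane wavefront: each v_pk_add_f32 adds the pair of partials fetched from lane i ^ stride, first via ds_bpermute_b32 (address XOR 0x80, i.e. lane XOR 32) and then ds_swizzle_b32 with swizzle(SWAP, N) for N = 16, 8, 4, 2, 1. A hypothetical scalar model of that pattern (butterfly_reduce is invented for illustration):

# Hypothetical model of the cross-lane reduction: after each exchange,
# lane i adds the partial held by lane i ^ stride; once the stride has
# halved from 32 down to 1, every lane holds the full 64-lane sum.
def butterfly_reduce(partials):        # len(partials) == 64, one per lane
    vals = list(partials)
    for stride in (32, 16, 8, 4, 2, 1):
        vals = [vals[i] + vals[i ^ stride] for i in range(64)]
    return vals                        # all entries now equal sum(partials)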
Traceback (most recent call last):
File "/data/reduction/matvec.py", line 57, in <module>
triton.testing.assert_close(c, ref)
File "/data/triton/.venv/lib/python3.11/site-packages/triton/testing.py", line 212, in assert_close
np.testing.assert_allclose(x, y, atol=atol, rtol=rtol, equal_nan=True)
File "/data/triton/.venv/lib/python3.11/site-packages/numpy/testing/_private/utils.py", line 1504, in assert_allclose
assert_array_compare(compare, actual, desired, err_msg=str(err_msg),
File "/home/mirror/.pyenv/versions/3.11.5/lib/python3.11/contextlib.py", line 81, in inner
return func(*args, **kwds)
^^^^^^^^^^^^^^^^^^^
File "/data/triton/.venv/lib/python3.11/site-packages/numpy/testing/_private/utils.py", line 797, in assert_array_compare
raise AssertionError(msg)
AssertionError:
Not equal to tolerance rtol=0, atol=0.01
Mismatched elements: 3 / 8192 (0.0366%)
Max absolute difference: 0.03125
Max relative difference: 0.002094
x: array([[ 109.44 , -112.3 , 22.38 , ..., 24.73 , 172.4 , -68.25 ],
[ 43.1 , -72.7 , 28.06 , ..., 40.94 , 107.75 , 40.9 ],
[ 19.78 , -20.12 , -28.95 , ..., -77.1 , 92.9 , 68.1 ],...
y: array([[ 109.44 , -112.3 , 22.38 , ..., 24.73 , 172.4 , -68.25 ],
[ 43.1 , -72.7 , 28.06 , ..., 40.94 , 107.75 , 40.9 ],
[ 19.78 , -20.12 , -28.95 , ..., -77.1 , 92.9 , 68.1 ],...
import torch
import triton
import triton.language as tl


@triton.jit
def matvec(
    a_ptr, b_ptr, c_ptr,
    M: tl.constexpr, N: tl.constexpr, K: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, TILE_SIZE_K: tl.constexpr,
):
    # Assume aligned shapes w.r.t. tile sizes
    tl.static_assert(M % BLOCK_SIZE_M == 0)
    tl.static_assert(N % BLOCK_SIZE_N == 0)
    tl.static_assert(K % TILE_SIZE_K == 0)
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    offset_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offset_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offset_k = tl.arange(0, TILE_SIZE_K)
    # Both A (M x K) and B (N x K) are row-major, so rows stride by K.
    a_ptrs = a_ptr + offset_m[:, None] * K + offset_k[None, :]
    b_ptrs = b_ptr + offset_n[:, None] * K + offset_k[None, :]
    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, TILE_SIZE_K)):
        a = tl.load(a_ptrs)
        b = tl.load(b_ptrs)
        # Instead of tl.dot: broadcast the two K-tiles against each other,
        # multiply elementwise over (M, N, K), and reduce over K in f32.
        a_bc = a.reshape(BLOCK_SIZE_M, 1, TILE_SIZE_K).broadcast_to(BLOCK_SIZE_M, BLOCK_SIZE_N, TILE_SIZE_K)
        b_bc = b.reshape(1, BLOCK_SIZE_N, TILE_SIZE_K).broadcast_to(BLOCK_SIZE_M, BLOCK_SIZE_N, TILE_SIZE_K)
        a_f32 = a_bc.to(tl.float32)
        b_f32 = b_bc.to(tl.float32)
        acc += tl.sum(a_f32 * b_f32, axis=-1)
        a_ptrs += TILE_SIZE_K
        b_ptrs += TILE_SIZE_K
    # C is M x N row-major; truncate the f32 accumulator back to f16 on store.
    c_ptrs = c_ptr + offset_m[:, None] * N + offset_n[None, :]
    c = acc.to(tl.float16)
    tl.store(c_ptrs, c)


# Problem sizes: C[m, n] = A[m, k] @ B[n, k]^T, i.e. m batched matvecs.
m = 8
n = 1024
k = 4096
a = torch.randn((m, k), device='cuda', dtype=torch.float16)
b = torch.randn((n, k), device='cuda', dtype=torch.float16)
c = torch.zeros((m, n), device='cuda', dtype=torch.float16)
n_warps = 16
block_m = 4
block_n = n_warps
tile_k = 2048  # k // tile_k = 2 iterations of the reduction loop
grid = (triton.cdiv(m, block_m), triton.cdiv(n, block_n))
matvec[grid](a, b, c, m, n, k, block_m, block_n, tile_k, num_warps=n_warps)
ref = torch.matmul(a, torch.transpose(b, 0, 1))
triton.testing.assert_close(c, ref)
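Note how this launch configuration surfaces in the dumps above: num_warps=16 together with block_n = n_warps gives the 3D #blocked layout its warpsPerCTA = [1, 16, 1] (one wavefront per output column of the 4x16 block), and tile_k = 2048 against k = 4096 is why the scf.for runs exactly two iterations (%c2_i32).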