Last active: August 2, 2024 01:46
Save antiagainst/e2d3d74b9f10d1d6b27c40badba0f7aa to your computer and use it in GitHub Desktop.
matvec in triton — MLIR IR dumps (one per compiler pass) for a Triton matrix-vector kernel targeting AMD gfx942.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -----// IR Dump Before ConvertTritonToTritonGPU (convert-triton-to-tritongpu) ('builtin.module' operation) //----- // | |
#loc = loc(unknown) | |
module { | |
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} { | |
%c2_i32 = arith.constant 2 : i32 loc(#loc) | |
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32> loc(#loc) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc) | |
%cst_0 = arith.constant dense<1024> : tensor<4x1xi32> loc(#loc) | |
%cst_1 = arith.constant dense<2048> : tensor<16x2048xi32> loc(#loc) | |
%cst_2 = arith.constant dense<2048> : tensor<4x2048xi32> loc(#loc) | |
%cst_3 = arith.constant dense<4096> : tensor<16x1xi32> loc(#loc) | |
%cst_4 = arith.constant dense<4096> : tensor<4x1xi32> loc(#loc) | |
%c16_i32 = arith.constant 16 : i32 loc(#loc) | |
%c4_i32 = arith.constant 4 : i32 loc(#loc) | |
%0 = tt.get_program_id x : i32 loc(#loc) | |
%1 = tt.get_program_id y : i32 loc(#loc) | |
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc) | |
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc) | |
%4 = tt.splat %2 : i32 -> tensor<4xi32> loc(#loc) | |
%5 = arith.addi %4, %3 : tensor<4xi32> loc(#loc) | |
%6 = arith.muli %1, %c16_i32 : i32 loc(#loc) | |
%7 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc) | |
%8 = tt.splat %6 : i32 -> tensor<16xi32> loc(#loc) | |
%9 = arith.addi %8, %7 : tensor<16xi32> loc(#loc) | |
%10 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc) | |
%11 = tt.expand_dims %5 {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc) | |
%12 = arith.muli %11, %cst_4 : tensor<4x1xi32> loc(#loc) | |
%13 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>> loc(#loc) | |
%14 = tt.addptr %13, %12 : tensor<4x1x!tt.ptr<f16>>, tensor<4x1xi32> loc(#loc) | |
%15 = tt.expand_dims %10 {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc) | |
%16 = tt.broadcast %14 : tensor<4x1x!tt.ptr<f16>> -> tensor<4x2048x!tt.ptr<f16>> loc(#loc) | |
%17 = tt.broadcast %15 : tensor<1x2048xi32> -> tensor<4x2048xi32> loc(#loc) | |
%18 = tt.addptr %16, %17 : tensor<4x2048x!tt.ptr<f16>>, tensor<4x2048xi32> loc(#loc) | |
%19 = tt.expand_dims %9 {axis = 1 : i32} : tensor<16xi32> -> tensor<16x1xi32> loc(#loc) | |
%20 = arith.muli %19, %cst_3 : tensor<16x1xi32> loc(#loc) | |
%21 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>> loc(#loc) | |
%22 = tt.addptr %21, %20 : tensor<16x1x!tt.ptr<f16>>, tensor<16x1xi32> loc(#loc) | |
%23 = tt.broadcast %22 : tensor<16x1x!tt.ptr<f16>> -> tensor<16x2048x!tt.ptr<f16>> loc(#loc) | |
%24 = tt.broadcast %15 : tensor<1x2048xi32> -> tensor<16x2048xi32> loc(#loc) | |
%25 = tt.addptr %23, %24 : tensor<16x2048x!tt.ptr<f16>>, tensor<16x2048xi32> loc(#loc) | |
%26:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %18, %arg6 = %25) -> (tensor<4x16xf32>, tensor<4x2048x!tt.ptr<f16>>, tensor<16x2048x!tt.ptr<f16>>) : i32 { | |
%35 = tt.load %arg5 : tensor<4x2048x!tt.ptr<f16>> loc(#loc) | |
%36 = tt.load %arg6 : tensor<16x2048x!tt.ptr<f16>> loc(#loc) | |
%37 = tt.reshape %35 {allow_reorder = false} : tensor<4x2048xf16> -> tensor<4x1x2048xf16> loc(#loc) | |
%38 = tt.reshape %36 {allow_reorder = false} : tensor<16x2048xf16> -> tensor<1x16x2048xf16> loc(#loc) | |
%39 = arith.extf %37 : tensor<4x1x2048xf16> to tensor<4x1x2048xf32> loc(#loc) | |
%40 = tt.broadcast %39 : tensor<4x1x2048xf32> -> tensor<4x16x2048xf32> loc(#loc) | |
%41 = arith.extf %38 : tensor<1x16x2048xf16> to tensor<1x16x2048xf32> loc(#loc) | |
%42 = tt.broadcast %41 : tensor<1x16x2048xf32> -> tensor<4x16x2048xf32> loc(#loc) | |
%43 = arith.mulf %40, %42 : tensor<4x16x2048xf32> loc(#loc) | |
%44 = "tt.reduce"(%43) <{axis = 2 : i32}> ({ | |
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): | |
%48 = arith.addf %arg7, %arg8 : f32 loc(#loc) | |
tt.reduce.return %48 : f32 loc(#loc) | |
}) : (tensor<4x16x2048xf32>) -> tensor<4x16xf32> loc(#loc) | |
%45 = arith.addf %arg4, %44 : tensor<4x16xf32> loc(#loc) | |
%46 = tt.addptr %arg5, %cst_2 : tensor<4x2048x!tt.ptr<f16>>, tensor<4x2048xi32> loc(#loc) | |
%47 = tt.addptr %arg6, %cst_1 : tensor<16x2048x!tt.ptr<f16>>, tensor<16x2048xi32> loc(#loc) | |
scf.yield %45, %46, %47 : tensor<4x16xf32>, tensor<4x2048x!tt.ptr<f16>>, tensor<16x2048x!tt.ptr<f16>> loc(#loc) | |
} loc(#loc) | |
%27 = arith.muli %11, %cst_0 : tensor<4x1xi32> loc(#loc) | |
%28 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>> loc(#loc) | |
%29 = tt.addptr %28, %27 : tensor<4x1x!tt.ptr<f16>>, tensor<4x1xi32> loc(#loc) | |
%30 = tt.expand_dims %9 {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc) | |
%31 = tt.broadcast %29 : tensor<4x1x!tt.ptr<f16>> -> tensor<4x16x!tt.ptr<f16>> loc(#loc) | |
%32 = tt.broadcast %30 : tensor<1x16xi32> -> tensor<4x16xi32> loc(#loc) | |
%33 = tt.addptr %31, %32 : tensor<4x16x!tt.ptr<f16>>, tensor<4x16xi32> loc(#loc) | |
%34 = arith.truncf %26#0 : tensor<4x16xf32> to tensor<4x16xf16> loc(#loc) | |
tt.store %33, %34 : tensor<4x16x!tt.ptr<f16>> loc(#loc) | |
tt.return loc(#loc) | |
} loc(#loc) | |
} loc(#loc) | |
// -----// IR Dump Before TritonGPUCoalesce (tritongpu-coalesce) ('builtin.module' operation) //----- // | |
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 16], warpsPerCTA = [16, 1], order = [1, 0]}> | |
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [16, 1], order = [1, 0]}> | |
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [1, 16], order = [1, 0]}> | |
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [16], order = [0]}> | |
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [16, 1], order = [0, 1]}> | |
#blocked5 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [1, 16], order = [0, 1]}> | |
#blocked6 = #triton_gpu.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 1, 16], order = [2, 1, 0]}> | |
#loc = loc(unknown) | |
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { | |
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} { | |
%c2_i32 = arith.constant 2 : i32 loc(#loc) | |
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #blocked> loc(#loc) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc) | |
%cst_0 = arith.constant dense<1024> : tensor<4x1xi32, #blocked1> loc(#loc) | |
%cst_1 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked2> loc(#loc) | |
%cst_2 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked2> loc(#loc) | |
%cst_3 = arith.constant dense<4096> : tensor<16x1xi32, #blocked1> loc(#loc) | |
%cst_4 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc) | |
%c16_i32 = arith.constant 16 : i32 loc(#loc) | |
%c4_i32 = arith.constant 4 : i32 loc(#loc) | |
%0 = tt.get_program_id x : i32 loc(#loc) | |
%1 = tt.get_program_id y : i32 loc(#loc) | |
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc) | |
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #blocked3> loc(#loc) | |
%4 = tt.splat %2 : i32 -> tensor<4xi32, #blocked3> loc(#loc) | |
%5 = arith.addi %4, %3 : tensor<4xi32, #blocked3> loc(#loc) | |
%6 = arith.muli %1, %c16_i32 : i32 loc(#loc) | |
%7 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #blocked3> loc(#loc) | |
%8 = tt.splat %6 : i32 -> tensor<16xi32, #blocked3> loc(#loc) | |
%9 = arith.addi %8, %7 : tensor<16xi32, #blocked3> loc(#loc) | |
%10 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #blocked3> loc(#loc) | |
%11 = triton_gpu.convert_layout %5 : tensor<4xi32, #blocked3> -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc) | |
%12 = tt.expand_dims %11 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> -> tensor<4x1xi32, #blocked4> loc(#loc) | |
%13 = triton_gpu.convert_layout %12 : tensor<4x1xi32, #blocked4> -> tensor<4x1xi32, #blocked1> loc(#loc) | |
%14 = arith.muli %13, %cst_4 : tensor<4x1xi32, #blocked1> loc(#loc) | |
%15 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%16 = tt.addptr %15, %14 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc) | |
%17 = triton_gpu.convert_layout %10 : tensor<2048xi32, #blocked3> -> tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> loc(#loc) | |
%18 = tt.expand_dims %17 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x2048xi32, #blocked5> loc(#loc) | |
%19 = triton_gpu.convert_layout %18 : tensor<1x2048xi32, #blocked5> -> tensor<1x2048xi32, #blocked2> loc(#loc) | |
%20 = tt.broadcast %16 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%21 = triton_gpu.convert_layout %20 : tensor<4x2048x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%22 = tt.broadcast %19 : tensor<1x2048xi32, #blocked2> -> tensor<4x2048xi32, #blocked2> loc(#loc) | |
%23 = tt.addptr %21, %22 : tensor<4x2048x!tt.ptr<f16>, #blocked2>, tensor<4x2048xi32, #blocked2> loc(#loc) | |
%24 = triton_gpu.convert_layout %9 : tensor<16xi32, #blocked3> -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc) | |
%25 = tt.expand_dims %24 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> -> tensor<16x1xi32, #blocked4> loc(#loc) | |
%26 = triton_gpu.convert_layout %25 : tensor<16x1xi32, #blocked4> -> tensor<16x1xi32, #blocked1> loc(#loc) | |
%27 = arith.muli %26, %cst_3 : tensor<16x1xi32, #blocked1> loc(#loc) | |
%28 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%29 = tt.addptr %28, %27 : tensor<16x1x!tt.ptr<f16>, #blocked1>, tensor<16x1xi32, #blocked1> loc(#loc) | |
%30 = tt.broadcast %29 : tensor<16x1x!tt.ptr<f16>, #blocked1> -> tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%31 = triton_gpu.convert_layout %30 : tensor<16x2048x!tt.ptr<f16>, #blocked1> -> tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%32 = tt.broadcast %19 : tensor<1x2048xi32, #blocked2> -> tensor<16x2048xi32, #blocked2> loc(#loc) | |
%33 = tt.addptr %31, %32 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc) | |
%34:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %23, %arg6 = %33) -> (tensor<4x16xf32, #blocked>, tensor<4x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048x!tt.ptr<f16>, #blocked2>) : i32 { | |
%46 = tt.load %arg5 : tensor<4x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%47 = tt.load %arg6 : tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%48 = tt.reshape %46 {allow_reorder = false} : tensor<4x2048xf16, #blocked2> -> tensor<4x1x2048xf16, #blocked6> loc(#loc) | |
%49 = tt.reshape %47 {allow_reorder = false} : tensor<16x2048xf16, #blocked2> -> tensor<1x16x2048xf16, #blocked6> loc(#loc) | |
%50 = arith.extf %48 : tensor<4x1x2048xf16, #blocked6> to tensor<4x1x2048xf32, #blocked6> loc(#loc) | |
%51 = tt.broadcast %50 : tensor<4x1x2048xf32, #blocked6> -> tensor<4x16x2048xf32, #blocked6> loc(#loc) | |
%52 = arith.extf %49 : tensor<1x16x2048xf16, #blocked6> to tensor<1x16x2048xf32, #blocked6> loc(#loc) | |
%53 = tt.broadcast %52 : tensor<1x16x2048xf32, #blocked6> -> tensor<4x16x2048xf32, #blocked6> loc(#loc) | |
%54 = arith.mulf %51, %53 : tensor<4x16x2048xf32, #blocked6> loc(#loc) | |
%55 = "tt.reduce"(%54) <{axis = 2 : i32}> ({ | |
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): | |
%60 = arith.addf %arg7, %arg8 : f32 loc(#loc) | |
tt.reduce.return %60 : f32 loc(#loc) | |
}) : (tensor<4x16x2048xf32, #blocked6>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked6}>> loc(#loc) | |
%56 = triton_gpu.convert_layout %55 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked6}>> -> tensor<4x16xf32, #blocked> loc(#loc) | |
%57 = arith.addf %arg4, %56 : tensor<4x16xf32, #blocked> loc(#loc) | |
%58 = tt.addptr %arg5, %cst_2 : tensor<4x2048x!tt.ptr<f16>, #blocked2>, tensor<4x2048xi32, #blocked2> loc(#loc) | |
%59 = tt.addptr %arg6, %cst_1 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc) | |
scf.yield %57, %58, %59 : tensor<4x16xf32, #blocked>, tensor<4x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
} loc(#loc) | |
%35 = arith.muli %13, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc) | |
%36 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%37 = tt.addptr %36, %35 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc) | |
%38 = triton_gpu.convert_layout %9 : tensor<16xi32, #blocked3> -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> loc(#loc) | |
%39 = tt.expand_dims %38 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x16xi32, #blocked5> loc(#loc) | |
%40 = triton_gpu.convert_layout %39 : tensor<1x16xi32, #blocked5> -> tensor<1x16xi32, #blocked> loc(#loc) | |
%41 = tt.broadcast %37 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x16x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%42 = triton_gpu.convert_layout %41 : tensor<4x16x!tt.ptr<f16>, #blocked1> -> tensor<4x16x!tt.ptr<f16>, #blocked> loc(#loc) | |
%43 = tt.broadcast %40 : tensor<1x16xi32, #blocked> -> tensor<4x16xi32, #blocked> loc(#loc) | |
%44 = tt.addptr %42, %43 : tensor<4x16x!tt.ptr<f16>, #blocked>, tensor<4x16xi32, #blocked> loc(#loc) | |
%45 = arith.truncf %34#0 : tensor<4x16xf32, #blocked> to tensor<4x16xf16, #blocked> loc(#loc) | |
tt.store %44, %45 : tensor<4x16x!tt.ptr<f16>, #blocked> loc(#loc) | |
tt.return loc(#loc) | |
} loc(#loc) | |
} loc(#loc) | |
// -----// IR Dump Before TritonGPURemoveLayoutConversions (tritongpu-remove-layout-conversions) ('builtin.module' operation) //----- // | |
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 16], warpsPerCTA = [16, 1], order = [1, 0]}> | |
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [16, 1], order = [1, 0]}> | |
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [1, 16], order = [1, 0]}> | |
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [16], order = [0]}> | |
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [16, 1], order = [0, 1]}> | |
#blocked5 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [1, 16], order = [0, 1]}> | |
#blocked6 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [4, 4], order = [1, 0]}> | |
#blocked7 = #triton_gpu.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 1, 16], order = [2, 1, 0]}> | |
#loc = loc(unknown) | |
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { | |
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} { | |
%c2_i32 = arith.constant 2 : i32 loc(#loc) | |
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #blocked> loc(#loc) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc) | |
%cst_0 = arith.constant dense<1024> : tensor<4x1xi32, #blocked1> loc(#loc) | |
%cst_1 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked2> loc(#loc) | |
%cst_2 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked2> loc(#loc) | |
%cst_3 = arith.constant dense<4096> : tensor<16x1xi32, #blocked1> loc(#loc) | |
%cst_4 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc) | |
%c16_i32 = arith.constant 16 : i32 loc(#loc) | |
%c4_i32 = arith.constant 4 : i32 loc(#loc) | |
%0 = tt.get_program_id x : i32 loc(#loc) | |
%1 = tt.get_program_id y : i32 loc(#loc) | |
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc) | |
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #blocked3> loc(#loc) | |
%4 = tt.splat %2 : i32 -> tensor<4xi32, #blocked3> loc(#loc) | |
%5 = arith.addi %4, %3 : tensor<4xi32, #blocked3> loc(#loc) | |
%6 = arith.muli %1, %c16_i32 : i32 loc(#loc) | |
%7 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #blocked3> loc(#loc) | |
%8 = tt.splat %6 : i32 -> tensor<16xi32, #blocked3> loc(#loc) | |
%9 = arith.addi %8, %7 : tensor<16xi32, #blocked3> loc(#loc) | |
%10 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #blocked3> loc(#loc) | |
%11 = triton_gpu.convert_layout %5 : tensor<4xi32, #blocked3> -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc) | |
%12 = tt.expand_dims %11 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> -> tensor<4x1xi32, #blocked4> loc(#loc) | |
%13 = triton_gpu.convert_layout %12 : tensor<4x1xi32, #blocked4> -> tensor<4x1xi32, #blocked1> loc(#loc) | |
%14 = arith.muli %13, %cst_4 : tensor<4x1xi32, #blocked1> loc(#loc) | |
%15 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%16 = tt.addptr %15, %14 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc) | |
%17 = triton_gpu.convert_layout %10 : tensor<2048xi32, #blocked3> -> tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> loc(#loc) | |
%18 = tt.expand_dims %17 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x2048xi32, #blocked5> loc(#loc) | |
%19 = triton_gpu.convert_layout %18 : tensor<1x2048xi32, #blocked5> -> tensor<1x2048xi32, #blocked2> loc(#loc) | |
%20 = tt.broadcast %16 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%21 = triton_gpu.convert_layout %20 : tensor<4x2048x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%22 = tt.broadcast %19 : tensor<1x2048xi32, #blocked2> -> tensor<4x2048xi32, #blocked2> loc(#loc) | |
%23 = tt.addptr %21, %22 : tensor<4x2048x!tt.ptr<f16>, #blocked2>, tensor<4x2048xi32, #blocked2> loc(#loc) | |
%24 = triton_gpu.convert_layout %9 : tensor<16xi32, #blocked3> -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc) | |
%25 = tt.expand_dims %24 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> -> tensor<16x1xi32, #blocked4> loc(#loc) | |
%26 = triton_gpu.convert_layout %25 : tensor<16x1xi32, #blocked4> -> tensor<16x1xi32, #blocked1> loc(#loc) | |
%27 = arith.muli %26, %cst_3 : tensor<16x1xi32, #blocked1> loc(#loc) | |
%28 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%29 = tt.addptr %28, %27 : tensor<16x1x!tt.ptr<f16>, #blocked1>, tensor<16x1xi32, #blocked1> loc(#loc) | |
%30 = tt.broadcast %29 : tensor<16x1x!tt.ptr<f16>, #blocked1> -> tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%31 = triton_gpu.convert_layout %30 : tensor<16x2048x!tt.ptr<f16>, #blocked1> -> tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%32 = tt.broadcast %19 : tensor<1x2048xi32, #blocked2> -> tensor<16x2048xi32, #blocked2> loc(#loc) | |
%33 = tt.addptr %31, %32 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc) | |
%34:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %23, %arg6 = %33) -> (tensor<4x16xf32, #blocked>, tensor<4x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048x!tt.ptr<f16>, #blocked2>) : i32 { | |
%48 = triton_gpu.convert_layout %arg5 : tensor<4x2048x!tt.ptr<f16>, #blocked2> -> tensor<4x2048x!tt.ptr<f16>, #blocked6> loc(#loc) | |
%49 = tt.load %48 : tensor<4x2048x!tt.ptr<f16>, #blocked6> loc(#loc) | |
%50 = triton_gpu.convert_layout %49 : tensor<4x2048xf16, #blocked6> -> tensor<4x2048xf16, #blocked2> loc(#loc) | |
%51 = triton_gpu.convert_layout %arg6 : tensor<16x2048x!tt.ptr<f16>, #blocked2> -> tensor<16x2048x!tt.ptr<f16>, #blocked6> loc(#loc) | |
%52 = tt.load %51 : tensor<16x2048x!tt.ptr<f16>, #blocked6> loc(#loc) | |
%53 = triton_gpu.convert_layout %52 : tensor<16x2048xf16, #blocked6> -> tensor<16x2048xf16, #blocked2> loc(#loc) | |
%54 = tt.reshape %50 {allow_reorder = false} : tensor<4x2048xf16, #blocked2> -> tensor<4x1x2048xf16, #blocked7> loc(#loc) | |
%55 = tt.reshape %53 {allow_reorder = false} : tensor<16x2048xf16, #blocked2> -> tensor<1x16x2048xf16, #blocked7> loc(#loc) | |
%56 = arith.extf %54 : tensor<4x1x2048xf16, #blocked7> to tensor<4x1x2048xf32, #blocked7> loc(#loc) | |
%57 = tt.broadcast %56 : tensor<4x1x2048xf32, #blocked7> -> tensor<4x16x2048xf32, #blocked7> loc(#loc) | |
%58 = arith.extf %55 : tensor<1x16x2048xf16, #blocked7> to tensor<1x16x2048xf32, #blocked7> loc(#loc) | |
%59 = tt.broadcast %58 : tensor<1x16x2048xf32, #blocked7> -> tensor<4x16x2048xf32, #blocked7> loc(#loc) | |
%60 = arith.mulf %57, %59 : tensor<4x16x2048xf32, #blocked7> loc(#loc) | |
%61 = "tt.reduce"(%60) <{axis = 2 : i32}> ({ | |
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): | |
%66 = arith.addf %arg7, %arg8 : f32 loc(#loc) | |
tt.reduce.return %66 : f32 loc(#loc) | |
}) : (tensor<4x16x2048xf32, #blocked7>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked7}>> loc(#loc) | |
%62 = triton_gpu.convert_layout %61 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked7}>> -> tensor<4x16xf32, #blocked> loc(#loc) | |
%63 = arith.addf %arg4, %62 : tensor<4x16xf32, #blocked> loc(#loc) | |
%64 = tt.addptr %arg5, %cst_2 : tensor<4x2048x!tt.ptr<f16>, #blocked2>, tensor<4x2048xi32, #blocked2> loc(#loc) | |
%65 = tt.addptr %arg6, %cst_1 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc) | |
scf.yield %63, %64, %65 : tensor<4x16xf32, #blocked>, tensor<4x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
} loc(#loc) | |
%35 = arith.muli %13, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc) | |
%36 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%37 = tt.addptr %36, %35 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc) | |
%38 = triton_gpu.convert_layout %9 : tensor<16xi32, #blocked3> -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> loc(#loc) | |
%39 = tt.expand_dims %38 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x16xi32, #blocked5> loc(#loc) | |
%40 = triton_gpu.convert_layout %39 : tensor<1x16xi32, #blocked5> -> tensor<1x16xi32, #blocked> loc(#loc) | |
%41 = tt.broadcast %37 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x16x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%42 = triton_gpu.convert_layout %41 : tensor<4x16x!tt.ptr<f16>, #blocked1> -> tensor<4x16x!tt.ptr<f16>, #blocked> loc(#loc) | |
%43 = tt.broadcast %40 : tensor<1x16xi32, #blocked> -> tensor<4x16xi32, #blocked> loc(#loc) | |
%44 = tt.addptr %42, %43 : tensor<4x16x!tt.ptr<f16>, #blocked>, tensor<4x16xi32, #blocked> loc(#loc) | |
%45 = arith.truncf %34#0 : tensor<4x16xf32, #blocked> to tensor<4x16xf16, #blocked> loc(#loc) | |
%46 = triton_gpu.convert_layout %44 : tensor<4x16x!tt.ptr<f16>, #blocked> -> tensor<4x16x!tt.ptr<f16>, #blocked> loc(#loc) | |
%47 = triton_gpu.convert_layout %45 : tensor<4x16xf16, #blocked> -> tensor<4x16xf16, #blocked> loc(#loc) | |
tt.store %46, %47 : tensor<4x16x!tt.ptr<f16>, #blocked> loc(#loc) | |
tt.return loc(#loc) | |
} loc(#loc) | |
} loc(#loc) | |
// -----// IR Dump Before TritonGPUOptimizeThreadLocality (tritongpu-optimize-thread-locality) ('builtin.module' operation) //----- // | |
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 4, 4], order = [2, 1, 0]}> | |
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [4, 4], order = [1, 0]}> | |
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 16], warpsPerCTA = [16, 1], order = [1, 0]}> | |
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [4, 1, 4], order = [2, 1, 0]}> | |
#loc = loc(unknown) | |
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { | |
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} { | |
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%c2_i32 = arith.constant 2 : i32 loc(#loc) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc) | |
%c4_i32 = arith.constant 4 : i32 loc(#loc) | |
%c16_i32 = arith.constant 16 : i32 loc(#loc) | |
%cst_0 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc) | |
%cst_1 = arith.constant dense<4096> : tensor<16x1xi32, #blocked1> loc(#loc) | |
%cst_2 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked1> loc(#loc) | |
%cst_3 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked1> loc(#loc) | |
%cst_4 = arith.constant dense<1024> : tensor<4x1xi32, #blocked2> loc(#loc) | |
%0 = tt.get_program_id x : i32 loc(#loc) | |
%1 = tt.get_program_id y : i32 loc(#loc) | |
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc) | |
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%4 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%5 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%6 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%7 = arith.addi %5, %3 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%8 = arith.addi %6, %4 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%9 = arith.muli %1, %c16_i32 : i32 loc(#loc) | |
%10 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc) | |
%12 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%13 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc) | |
%14 = arith.addi %12, %10 : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%15 = arith.addi %13, %11 : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc) | |
%16 = tt.expand_dims %7 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc) | |
%17 = tt.expand_dims %8 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<4x1xi32, #blocked2> loc(#loc) | |
%18 = arith.muli %16, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc) | |
%19 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%20 = tt.addptr %19, %18 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc) | |
%21 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> loc(#loc) | |
%22 = tt.expand_dims %21 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x2048xi32, #blocked1> loc(#loc) | |
%23 = tt.broadcast %20 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%24 = tt.broadcast %22 : tensor<1x2048xi32, #blocked1> -> tensor<4x2048xi32, #blocked1> loc(#loc) | |
%25 = tt.addptr %23, %24 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc) | |
%26 = tt.expand_dims %14 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<16x1xi32, #blocked1> loc(#loc) | |
%27 = arith.muli %26, %cst_1 : tensor<16x1xi32, #blocked1> loc(#loc) | |
%28 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%29 = tt.addptr %28, %27 : tensor<16x1x!tt.ptr<f16>, #blocked1>, tensor<16x1xi32, #blocked1> loc(#loc) | |
%30 = tt.broadcast %29 : tensor<16x1x!tt.ptr<f16>, #blocked1> -> tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%31 = tt.broadcast %22 : tensor<1x2048xi32, #blocked1> -> tensor<16x2048xi32, #blocked1> loc(#loc) | |
%32 = tt.addptr %30, %31 : tensor<16x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048xi32, #blocked1> loc(#loc) | |
%33:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %25, %arg6 = %32) -> (tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked1>) : i32 { | |
%43 = tt.load %arg5 : tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%44 = tt.load %arg6 : tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%45 = tt.reshape %43 {allow_reorder = false} : tensor<4x2048xf16, #blocked1> -> tensor<4x1x2048xf16, #blocked3> loc(#loc) | |
%46 = tt.reshape %44 {allow_reorder = false} : tensor<16x2048xf16, #blocked1> -> tensor<1x16x2048xf16, #blocked> loc(#loc) | |
%47 = arith.extf %45 : tensor<4x1x2048xf16, #blocked3> to tensor<4x1x2048xf32, #blocked3> loc(#loc) | |
%48 = triton_gpu.convert_layout %47 : tensor<4x1x2048xf32, #blocked3> -> tensor<4x1x2048xf32, #blocked> loc(#loc) | |
%49 = tt.broadcast %48 : tensor<4x1x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%50 = arith.extf %46 : tensor<1x16x2048xf16, #blocked> to tensor<1x16x2048xf32, #blocked> loc(#loc) | |
%51 = tt.broadcast %50 : tensor<1x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%52 = arith.mulf %49, %51 : tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%53 = "tt.reduce"(%52) <{axis = 2 : i32}> ({ | |
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): | |
%57 = arith.addf %arg7, %arg8 : f32 loc(#loc) | |
tt.reduce.return %57 : f32 loc(#loc) | |
}) : (tensor<4x16x2048xf32, #blocked>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%54 = arith.addf %arg4, %53 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%55 = tt.addptr %arg5, %cst_2 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc) | |
%56 = tt.addptr %arg6, %cst_3 : tensor<16x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048xi32, #blocked1> loc(#loc) | |
scf.yield %54, %55, %56 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
} loc(#loc) | |
%34 = arith.muli %17, %cst_4 : tensor<4x1xi32, #blocked2> loc(#loc) | |
%35 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%36 = tt.addptr %35, %34 : tensor<4x1x!tt.ptr<f16>, #blocked2>, tensor<4x1xi32, #blocked2> loc(#loc) | |
%37 = tt.expand_dims %15 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x16xi32, #blocked2> loc(#loc) | |
%38 = tt.broadcast %36 : tensor<4x1x!tt.ptr<f16>, #blocked2> -> tensor<4x16x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%39 = tt.broadcast %37 : tensor<1x16xi32, #blocked2> -> tensor<4x16xi32, #blocked2> loc(#loc) | |
%40 = tt.addptr %38, %39 : tensor<4x16x!tt.ptr<f16>, #blocked2>, tensor<4x16xi32, #blocked2> loc(#loc) | |
%41 = arith.truncf %33#0 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> to tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%42 = triton_gpu.convert_layout %41 : tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16xf16, #blocked2> loc(#loc) | |
tt.store %40, %42 : tensor<4x16x!tt.ptr<f16>, #blocked2> loc(#loc) | |
tt.return loc(#loc) | |
} loc(#loc) | |
} loc(#loc) | |
// -----// IR Dump Before TritonAMDGPUAccelerateMatmul (tritonamdgpu-accelerate-matmul) ('builtin.module' operation) //----- // | |
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 4, 4], order = [2, 1, 0]}> | |
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [4, 4], order = [1, 0]}> | |
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 16], warpsPerCTA = [16, 1], order = [1, 0]}> | |
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [4, 1, 4], order = [2, 1, 0]}> | |
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 16, 1], order = [2, 1, 0]}> | |
#loc = loc(unknown) | |
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { | |
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} { | |
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%c2_i32 = arith.constant 2 : i32 loc(#loc) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc) | |
%c4_i32 = arith.constant 4 : i32 loc(#loc) | |
%c16_i32 = arith.constant 16 : i32 loc(#loc) | |
%cst_0 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc) | |
%cst_1 = arith.constant dense<4096> : tensor<16x1xi32, #blocked1> loc(#loc) | |
%cst_2 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked1> loc(#loc) | |
%cst_3 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked1> loc(#loc) | |
%cst_4 = arith.constant dense<1024> : tensor<4x1xi32, #blocked2> loc(#loc) | |
%0 = tt.get_program_id x : i32 loc(#loc) | |
%1 = tt.get_program_id y : i32 loc(#loc) | |
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc) | |
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%4 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%5 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%6 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%7 = arith.addi %5, %3 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%8 = arith.addi %6, %4 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%9 = arith.muli %1, %c16_i32 : i32 loc(#loc) | |
%10 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc) | |
%12 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%13 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc) | |
%14 = arith.addi %12, %10 : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%15 = arith.addi %13, %11 : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc) | |
%16 = tt.expand_dims %7 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc) | |
%17 = tt.expand_dims %8 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<4x1xi32, #blocked2> loc(#loc) | |
%18 = arith.muli %16, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc) | |
%19 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%20 = tt.addptr %19, %18 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc) | |
%21 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> loc(#loc) | |
%22 = tt.expand_dims %21 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x2048xi32, #blocked1> loc(#loc) | |
%23 = tt.broadcast %20 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%24 = tt.broadcast %22 : tensor<1x2048xi32, #blocked1> -> tensor<4x2048xi32, #blocked1> loc(#loc) | |
%25 = tt.addptr %23, %24 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc) | |
%26 = tt.expand_dims %14 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<16x1xi32, #blocked1> loc(#loc) | |
%27 = arith.muli %26, %cst_1 : tensor<16x1xi32, #blocked1> loc(#loc) | |
%28 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%29 = tt.addptr %28, %27 : tensor<16x1x!tt.ptr<f16>, #blocked1>, tensor<16x1xi32, #blocked1> loc(#loc) | |
%30 = tt.broadcast %29 : tensor<16x1x!tt.ptr<f16>, #blocked1> -> tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%31 = tt.broadcast %22 : tensor<1x2048xi32, #blocked1> -> tensor<16x2048xi32, #blocked1> loc(#loc) | |
%32 = tt.addptr %30, %31 : tensor<16x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048xi32, #blocked1> loc(#loc) | |
%33:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %25, %arg6 = %32) -> (tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked1>) : i32 { | |
%43 = tt.load %arg5 : tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%44 = tt.load %arg6 : tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%45 = tt.reshape %43 {allow_reorder = false} : tensor<4x2048xf16, #blocked1> -> tensor<4x1x2048xf16, #blocked3> loc(#loc) | |
%46 = tt.reshape %44 {allow_reorder = false} : tensor<16x2048xf16, #blocked1> -> tensor<1x16x2048xf16, #blocked> loc(#loc) | |
%47 = arith.extf %45 : tensor<4x1x2048xf16, #blocked3> to tensor<4x1x2048xf32, #blocked3> loc(#loc) | |
%48 = triton_gpu.convert_layout %47 : tensor<4x1x2048xf32, #blocked3> -> tensor<4x1x2048xf32, #blocked> loc(#loc) | |
%49 = tt.broadcast %48 : tensor<4x1x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%50 = arith.extf %46 : tensor<1x16x2048xf16, #blocked> to tensor<1x16x2048xf32, #blocked> loc(#loc) | |
%51 = tt.broadcast %50 : tensor<1x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%52 = arith.mulf %49, %51 : tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%53 = triton_gpu.convert_layout %52 : tensor<4x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked4> loc(#loc) | |
%54 = "tt.reduce"(%53) <{axis = 2 : i32}> ({ | |
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): | |
%59 = arith.addf %arg7, %arg8 : f32 loc(#loc) | |
tt.reduce.return %59 : f32 loc(#loc) | |
}) {preserve_layout} : (tensor<4x16x2048xf32, #blocked4>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked4}>> loc(#loc) | |
%55 = triton_gpu.convert_layout %54 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked4}>> -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%56 = arith.addf %arg4, %55 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%57 = tt.addptr %arg5, %cst_2 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc) | |
%58 = tt.addptr %arg6, %cst_3 : tensor<16x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048xi32, #blocked1> loc(#loc) | |
scf.yield %56, %57, %58 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
} loc(#loc) | |
%34 = arith.muli %17, %cst_4 : tensor<4x1xi32, #blocked2> loc(#loc) | |
%35 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%36 = tt.addptr %35, %34 : tensor<4x1x!tt.ptr<f16>, #blocked2>, tensor<4x1xi32, #blocked2> loc(#loc) | |
%37 = tt.expand_dims %15 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x16xi32, #blocked2> loc(#loc) | |
%38 = tt.broadcast %36 : tensor<4x1x!tt.ptr<f16>, #blocked2> -> tensor<4x16x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%39 = tt.broadcast %37 : tensor<1x16xi32, #blocked2> -> tensor<4x16xi32, #blocked2> loc(#loc) | |
%40 = tt.addptr %38, %39 : tensor<4x16x!tt.ptr<f16>, #blocked2>, tensor<4x16xi32, #blocked2> loc(#loc) | |
%41 = arith.truncf %33#0 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> to tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%42 = triton_gpu.convert_layout %41 : tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16xf16, #blocked2> loc(#loc) | |
tt.store %40, %42 : tensor<4x16x!tt.ptr<f16>, #blocked2> loc(#loc) | |
tt.return loc(#loc) | |
} loc(#loc) | |
} loc(#loc) | |
// -----// IR Dump Before TritonGPURemoveLayoutConversions (tritongpu-remove-layout-conversions) ('builtin.module' operation) //----- // | |
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 4, 4], order = [2, 1, 0]}> | |
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [4, 4], order = [1, 0]}> | |
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 16], warpsPerCTA = [16, 1], order = [1, 0]}> | |
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [4, 1, 4], order = [2, 1, 0]}> | |
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 16, 1], order = [2, 1, 0]}> | |
#loc = loc(unknown) | |
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { | |
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} { | |
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%c2_i32 = arith.constant 2 : i32 loc(#loc) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc) | |
%c4_i32 = arith.constant 4 : i32 loc(#loc) | |
%c16_i32 = arith.constant 16 : i32 loc(#loc) | |
%cst_0 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc) | |
%cst_1 = arith.constant dense<4096> : tensor<16x1xi32, #blocked1> loc(#loc) | |
%cst_2 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked1> loc(#loc) | |
%cst_3 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked1> loc(#loc) | |
%cst_4 = arith.constant dense<1024> : tensor<4x1xi32, #blocked2> loc(#loc) | |
%0 = tt.get_program_id x : i32 loc(#loc) | |
%1 = tt.get_program_id y : i32 loc(#loc) | |
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc) | |
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%4 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%5 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%6 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%7 = arith.addi %5, %3 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%8 = arith.addi %6, %4 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%9 = arith.muli %1, %c16_i32 : i32 loc(#loc) | |
%10 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc) | |
%12 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%13 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc) | |
%14 = arith.addi %12, %10 : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%15 = arith.addi %13, %11 : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc) | |
%16 = tt.expand_dims %7 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc) | |
%17 = tt.expand_dims %8 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<4x1xi32, #blocked2> loc(#loc) | |
%18 = arith.muli %16, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc) | |
%19 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%20 = tt.addptr %19, %18 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc) | |
%21 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> loc(#loc) | |
%22 = tt.expand_dims %21 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x2048xi32, #blocked1> loc(#loc) | |
%23 = tt.broadcast %20 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%24 = tt.broadcast %22 : tensor<1x2048xi32, #blocked1> -> tensor<4x2048xi32, #blocked1> loc(#loc) | |
%25 = tt.addptr %23, %24 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc) | |
%26 = tt.expand_dims %14 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<16x1xi32, #blocked1> loc(#loc) | |
%27 = arith.muli %26, %cst_1 : tensor<16x1xi32, #blocked1> loc(#loc) | |
%28 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%29 = tt.addptr %28, %27 : tensor<16x1x!tt.ptr<f16>, #blocked1>, tensor<16x1xi32, #blocked1> loc(#loc) | |
%30 = tt.broadcast %29 : tensor<16x1x!tt.ptr<f16>, #blocked1> -> tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%31 = tt.broadcast %22 : tensor<1x2048xi32, #blocked1> -> tensor<16x2048xi32, #blocked1> loc(#loc) | |
%32 = tt.addptr %30, %31 : tensor<16x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048xi32, #blocked1> loc(#loc) | |
%33:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %25, %arg6 = %32) -> (tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked1>) : i32 { | |
%43 = tt.load %arg5 : tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%44 = tt.load %arg6 : tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%45 = tt.reshape %43 {allow_reorder = false} : tensor<4x2048xf16, #blocked1> -> tensor<4x1x2048xf16, #blocked3> loc(#loc) | |
%46 = tt.reshape %44 {allow_reorder = false} : tensor<16x2048xf16, #blocked1> -> tensor<1x16x2048xf16, #blocked> loc(#loc) | |
%47 = arith.extf %45 : tensor<4x1x2048xf16, #blocked3> to tensor<4x1x2048xf32, #blocked3> loc(#loc) | |
%48 = triton_gpu.convert_layout %47 : tensor<4x1x2048xf32, #blocked3> -> tensor<4x1x2048xf32, #blocked> loc(#loc) | |
%49 = tt.broadcast %48 : tensor<4x1x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%50 = arith.extf %46 : tensor<1x16x2048xf16, #blocked> to tensor<1x16x2048xf32, #blocked> loc(#loc) | |
%51 = tt.broadcast %50 : tensor<1x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%52 = arith.mulf %49, %51 : tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%53 = triton_gpu.convert_layout %52 : tensor<4x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked4> loc(#loc) | |
%54 = "tt.reduce"(%53) <{axis = 2 : i32}> ({ | |
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): | |
%59 = arith.addf %arg7, %arg8 : f32 loc(#loc) | |
tt.reduce.return %59 : f32 loc(#loc) | |
}) {preserve_layout} : (tensor<4x16x2048xf32, #blocked4>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked4}>> loc(#loc) | |
%55 = triton_gpu.convert_layout %54 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked4}>> -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%56 = arith.addf %arg4, %55 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%57 = tt.addptr %arg5, %cst_2 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc) | |
%58 = tt.addptr %arg6, %cst_3 : tensor<16x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048xi32, #blocked1> loc(#loc) | |
scf.yield %56, %57, %58 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
} loc(#loc) | |
%34 = arith.muli %17, %cst_4 : tensor<4x1xi32, #blocked2> loc(#loc) | |
%35 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%36 = tt.addptr %35, %34 : tensor<4x1x!tt.ptr<f16>, #blocked2>, tensor<4x1xi32, #blocked2> loc(#loc) | |
%37 = tt.expand_dims %15 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x16xi32, #blocked2> loc(#loc) | |
%38 = tt.broadcast %36 : tensor<4x1x!tt.ptr<f16>, #blocked2> -> tensor<4x16x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%39 = tt.broadcast %37 : tensor<1x16xi32, #blocked2> -> tensor<4x16xi32, #blocked2> loc(#loc) | |
%40 = tt.addptr %38, %39 : tensor<4x16x!tt.ptr<f16>, #blocked2>, tensor<4x16xi32, #blocked2> loc(#loc) | |
%41 = arith.truncf %33#0 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> to tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%42 = triton_gpu.convert_layout %41 : tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16xf16, #blocked2> loc(#loc) | |
tt.store %40, %42 : tensor<4x16x!tt.ptr<f16>, #blocked2> loc(#loc) | |
tt.return loc(#loc) | |
} loc(#loc) | |
} loc(#loc) | |
// -----// IR Dump Before TritonAMDGPUOptimizeEpilogue (tritonamdgpu-optimize-epilogue) ('builtin.module' operation) //----- // | |
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 16, 1], order = [2, 1, 0]}> | |
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [4, 4], order = [1, 0]}> | |
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [16, 1], order = [1, 0]}> | |
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [4, 1, 4], order = [2, 1, 0]}> | |
#loc = loc(unknown) | |
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { | |
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} { | |
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%c2_i32 = arith.constant 2 : i32 loc(#loc) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc) | |
%c4_i32 = arith.constant 4 : i32 loc(#loc) | |
%c16_i32 = arith.constant 16 : i32 loc(#loc) | |
%cst_0 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc) | |
%cst_1 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked1> loc(#loc) | |
%cst_2 = arith.constant dense<1024> : tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%cst_3 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked2> loc(#loc) | |
%cst_4 = arith.constant dense<4096> : tensor<16x1xi32, #blocked2> loc(#loc) | |
%0 = tt.get_program_id x : i32 loc(#loc) | |
%1 = tt.get_program_id y : i32 loc(#loc) | |
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc) | |
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%4 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%5 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%6 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%7 = arith.addi %5, %3 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%8 = arith.addi %6, %4 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%9 = arith.muli %1, %c16_i32 : i32 loc(#loc) | |
%10 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%12 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%13 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%14 = arith.addi %12, %10 : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%15 = arith.addi %13, %11 : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%16 = tt.expand_dims %7 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc) | |
%17 = tt.expand_dims %8 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> -> tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%18 = arith.muli %16, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc) | |
%19 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%20 = tt.addptr %19, %18 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc) | |
%21 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc) | |
%22 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> loc(#loc) | |
%23 = tt.expand_dims %21 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x2048xi32, #blocked2> loc(#loc) | |
%24 = tt.expand_dims %22 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x2048xi32, #blocked1> loc(#loc) | |
%25 = tt.broadcast %20 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%26 = tt.broadcast %24 : tensor<1x2048xi32, #blocked1> -> tensor<4x2048xi32, #blocked1> loc(#loc) | |
%27 = tt.addptr %25, %26 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc) | |
%28 = tt.expand_dims %14 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1xi32, #blocked2> loc(#loc) | |
%29 = arith.muli %28, %cst_4 : tensor<16x1xi32, #blocked2> loc(#loc) | |
%30 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%31 = tt.addptr %30, %29 : tensor<16x1x!tt.ptr<f16>, #blocked2>, tensor<16x1xi32, #blocked2> loc(#loc) | |
%32 = tt.broadcast %31 : tensor<16x1x!tt.ptr<f16>, #blocked2> -> tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%33 = tt.broadcast %23 : tensor<1x2048xi32, #blocked2> -> tensor<16x2048xi32, #blocked2> loc(#loc) | |
%34 = tt.addptr %32, %33 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc) | |
%35:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %27, %arg6 = %34) -> (tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked2>) : i32 { | |
%44 = tt.load %arg5 : tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%45 = tt.load %arg6 : tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%46 = tt.reshape %44 {allow_reorder = false} : tensor<4x2048xf16, #blocked1> -> tensor<4x1x2048xf16, #blocked3> loc(#loc) | |
%47 = tt.reshape %45 {allow_reorder = false} : tensor<16x2048xf16, #blocked2> -> tensor<1x16x2048xf16, #blocked> loc(#loc) | |
%48 = triton_gpu.convert_layout %46 : tensor<4x1x2048xf16, #blocked3> -> tensor<4x1x2048xf16, #blocked> loc(#loc) | |
%49 = arith.extf %48 : tensor<4x1x2048xf16, #blocked> to tensor<4x1x2048xf32, #blocked> loc(#loc) | |
%50 = tt.broadcast %49 : tensor<4x1x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%51 = arith.extf %47 : tensor<1x16x2048xf16, #blocked> to tensor<1x16x2048xf32, #blocked> loc(#loc) | |
%52 = tt.broadcast %51 : tensor<1x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%53 = arith.mulf %50, %52 : tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%54 = "tt.reduce"(%53) <{axis = 2 : i32}> ({ | |
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): | |
%58 = arith.addf %arg7, %arg8 : f32 loc(#loc) | |
tt.reduce.return %58 : f32 loc(#loc) | |
}) {preserve_layout} : (tensor<4x16x2048xf32, #blocked>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%55 = arith.addf %arg4, %54 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%56 = tt.addptr %arg5, %cst_1 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc) | |
%57 = tt.addptr %arg6, %cst_3 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc) | |
scf.yield %55, %56, %57 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
} loc(#loc) | |
%36 = arith.muli %17, %cst_2 : tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%37 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%38 = tt.addptr %37, %36 : tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%39 = tt.expand_dims %15 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> -> tensor<1x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%40 = tt.broadcast %38 : tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%41 = tt.broadcast %39 : tensor<1x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%42 = tt.addptr %40, %41 : tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%43 = arith.truncf %35#0 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> to tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
tt.store %42, %43 : tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
tt.return loc(#loc) | |
} loc(#loc) | |
} loc(#loc) | |
// -----// IR Dump Before TritonGPUOptimizeDotOperands (tritongpu-optimize-dot-operands) ('builtin.module' operation) //----- // | |
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 16, 1], order = [2, 1, 0]}> | |
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [4, 4], order = [1, 0]}> | |
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [16, 1], order = [1, 0]}> | |
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [4, 1, 4], order = [2, 1, 0]}> | |
#loc = loc(unknown) | |
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { | |
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} { | |
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%c2_i32 = arith.constant 2 : i32 loc(#loc) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc) | |
%c4_i32 = arith.constant 4 : i32 loc(#loc) | |
%c16_i32 = arith.constant 16 : i32 loc(#loc) | |
%cst_0 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc) | |
%cst_1 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked1> loc(#loc) | |
%cst_2 = arith.constant dense<1024> : tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%cst_3 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked2> loc(#loc) | |
%cst_4 = arith.constant dense<4096> : tensor<16x1xi32, #blocked2> loc(#loc) | |
%0 = tt.get_program_id x : i32 loc(#loc) | |
%1 = tt.get_program_id y : i32 loc(#loc) | |
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc) | |
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%4 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%5 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%6 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%7 = arith.addi %5, %3 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%8 = arith.addi %6, %4 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%9 = arith.muli %1, %c16_i32 : i32 loc(#loc) | |
%10 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%12 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%13 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%14 = arith.addi %12, %10 : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%15 = arith.addi %13, %11 : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%16 = tt.expand_dims %7 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc) | |
%17 = tt.expand_dims %8 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> -> tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%18 = arith.muli %16, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc) | |
%19 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%20 = tt.addptr %19, %18 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc) | |
%21 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc) | |
%22 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> loc(#loc) | |
%23 = tt.expand_dims %21 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x2048xi32, #blocked2> loc(#loc) | |
%24 = tt.expand_dims %22 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x2048xi32, #blocked1> loc(#loc) | |
%25 = tt.broadcast %20 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%26 = tt.broadcast %24 : tensor<1x2048xi32, #blocked1> -> tensor<4x2048xi32, #blocked1> loc(#loc) | |
%27 = tt.addptr %25, %26 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc) | |
%28 = tt.expand_dims %14 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1xi32, #blocked2> loc(#loc) | |
%29 = arith.muli %28, %cst_4 : tensor<16x1xi32, #blocked2> loc(#loc) | |
%30 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%31 = tt.addptr %30, %29 : tensor<16x1x!tt.ptr<f16>, #blocked2>, tensor<16x1xi32, #blocked2> loc(#loc) | |
%32 = tt.broadcast %31 : tensor<16x1x!tt.ptr<f16>, #blocked2> -> tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%33 = tt.broadcast %23 : tensor<1x2048xi32, #blocked2> -> tensor<16x2048xi32, #blocked2> loc(#loc) | |
%34 = tt.addptr %32, %33 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc) | |
%35:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %27, %arg6 = %34) -> (tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked2>) : i32 { | |
%44 = tt.load %arg5 : tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%45 = tt.load %arg6 : tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%46 = tt.reshape %44 {allow_reorder = false} : tensor<4x2048xf16, #blocked1> -> tensor<4x1x2048xf16, #blocked3> loc(#loc) | |
%47 = tt.reshape %45 {allow_reorder = false} : tensor<16x2048xf16, #blocked2> -> tensor<1x16x2048xf16, #blocked> loc(#loc) | |
%48 = triton_gpu.convert_layout %46 : tensor<4x1x2048xf16, #blocked3> -> tensor<4x1x2048xf16, #blocked> loc(#loc) | |
%49 = arith.extf %48 : tensor<4x1x2048xf16, #blocked> to tensor<4x1x2048xf32, #blocked> loc(#loc) | |
%50 = tt.broadcast %49 : tensor<4x1x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%51 = arith.extf %47 : tensor<1x16x2048xf16, #blocked> to tensor<1x16x2048xf32, #blocked> loc(#loc) | |
%52 = tt.broadcast %51 : tensor<1x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%53 = arith.mulf %50, %52 : tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%54 = "tt.reduce"(%53) <{axis = 2 : i32}> ({ | |
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): | |
%58 = arith.addf %arg7, %arg8 : f32 loc(#loc) | |
tt.reduce.return %58 : f32 loc(#loc) | |
}) {preserve_layout} : (tensor<4x16x2048xf32, #blocked>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%55 = arith.addf %arg4, %54 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%56 = tt.addptr %arg5, %cst_1 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc) | |
%57 = tt.addptr %arg6, %cst_3 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc) | |
scf.yield %55, %56, %57 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
} loc(#loc) | |
%36 = arith.muli %17, %cst_2 : tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%37 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%38 = tt.addptr %37, %36 : tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%39 = tt.expand_dims %15 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> -> tensor<1x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%40 = tt.broadcast %38 : tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%41 = tt.broadcast %39 : tensor<1x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%42 = tt.addptr %40, %41 : tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%43 = arith.truncf %35#0 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> to tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
tt.store %42, %43 : tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
tt.return loc(#loc) | |
} loc(#loc) | |
} loc(#loc) | |
// -----// IR Dump Before TritonAMDGPUStreamPipelineV2 (tritonamdgpu-stream-pipeline-v2) ('builtin.module' operation) //----- // | |
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 16, 1], order = [2, 1, 0]}> | |
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [4, 4], order = [1, 0]}> | |
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [16, 1], order = [1, 0]}> | |
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [4, 1, 4], order = [2, 1, 0]}> | |
#loc = loc(unknown) | |
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { | |
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} { | |
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%c2_i32 = arith.constant 2 : i32 loc(#loc) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc) | |
%c4_i32 = arith.constant 4 : i32 loc(#loc) | |
%c16_i32 = arith.constant 16 : i32 loc(#loc) | |
%cst_0 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc) | |
%cst_1 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked1> loc(#loc) | |
%cst_2 = arith.constant dense<1024> : tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%cst_3 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked2> loc(#loc) | |
%cst_4 = arith.constant dense<4096> : tensor<16x1xi32, #blocked2> loc(#loc) | |
%0 = tt.get_program_id x : i32 loc(#loc) | |
%1 = tt.get_program_id y : i32 loc(#loc) | |
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc) | |
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%4 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%5 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%6 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%7 = arith.addi %5, %3 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%8 = arith.addi %6, %4 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%9 = arith.muli %1, %c16_i32 : i32 loc(#loc) | |
%10 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%12 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%13 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%14 = arith.addi %12, %10 : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%15 = arith.addi %13, %11 : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%16 = tt.expand_dims %7 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc) | |
%17 = tt.expand_dims %8 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> -> tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%18 = arith.muli %16, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc) | |
%19 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%20 = tt.addptr %19, %18 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc) | |
%21 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc) | |
%22 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> loc(#loc) | |
%23 = tt.expand_dims %21 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x2048xi32, #blocked2> loc(#loc) | |
%24 = tt.expand_dims %22 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x2048xi32, #blocked1> loc(#loc) | |
%25 = tt.broadcast %20 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%26 = tt.broadcast %24 : tensor<1x2048xi32, #blocked1> -> tensor<4x2048xi32, #blocked1> loc(#loc) | |
%27 = tt.addptr %25, %26 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc) | |
%28 = tt.expand_dims %14 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1xi32, #blocked2> loc(#loc) | |
%29 = arith.muli %28, %cst_4 : tensor<16x1xi32, #blocked2> loc(#loc) | |
%30 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%31 = tt.addptr %30, %29 : tensor<16x1x!tt.ptr<f16>, #blocked2>, tensor<16x1xi32, #blocked2> loc(#loc) | |
%32 = tt.broadcast %31 : tensor<16x1x!tt.ptr<f16>, #blocked2> -> tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%33 = tt.broadcast %23 : tensor<1x2048xi32, #blocked2> -> tensor<16x2048xi32, #blocked2> loc(#loc) | |
%34 = tt.addptr %32, %33 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc) | |
%35:3 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %27, %arg6 = %34) -> (tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked2>) : i32 { | |
%44 = tt.load %arg5 : tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%45 = tt.load %arg6 : tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%46 = tt.reshape %44 {allow_reorder = false} : tensor<4x2048xf16, #blocked1> -> tensor<4x1x2048xf16, #blocked3> loc(#loc) | |
%47 = tt.reshape %45 {allow_reorder = false} : tensor<16x2048xf16, #blocked2> -> tensor<1x16x2048xf16, #blocked> loc(#loc) | |
%48 = triton_gpu.convert_layout %46 : tensor<4x1x2048xf16, #blocked3> -> tensor<4x1x2048xf16, #blocked> loc(#loc) | |
%49 = arith.extf %48 : tensor<4x1x2048xf16, #blocked> to tensor<4x1x2048xf32, #blocked> loc(#loc) | |
%50 = tt.broadcast %49 : tensor<4x1x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%51 = arith.extf %47 : tensor<1x16x2048xf16, #blocked> to tensor<1x16x2048xf32, #blocked> loc(#loc) | |
%52 = tt.broadcast %51 : tensor<1x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%53 = arith.mulf %50, %52 : tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%54 = "tt.reduce"(%53) <{axis = 2 : i32}> ({ | |
^bb0(%arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): | |
%58 = arith.addf %arg7, %arg8 : f32 loc(#loc) | |
tt.reduce.return %58 : f32 loc(#loc) | |
}) {preserve_layout} : (tensor<4x16x2048xf32, #blocked>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%55 = arith.addf %arg4, %54 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%56 = tt.addptr %arg5, %cst_1 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc) | |
%57 = tt.addptr %arg6, %cst_3 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc) | |
scf.yield %55, %56, %57 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
} loc(#loc) | |
%36 = arith.muli %17, %cst_2 : tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%37 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%38 = tt.addptr %37, %36 : tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%39 = tt.expand_dims %15 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> -> tensor<1x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%40 = tt.broadcast %38 : tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%41 = tt.broadcast %39 : tensor<1x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%42 = tt.addptr %40, %41 : tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%43 = arith.truncf %35#0 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> to tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
tt.store %42, %43 : tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
tt.return loc(#loc) | |
} loc(#loc) | |
} loc(#loc) | |
// -----// IR Dump Before Canonicalizer (canonicalize) ('builtin.module' operation) //----- // | |
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [1, 16, 1], order = [2, 1, 0]}> | |
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [4, 4], order = [1, 0]}> | |
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 64], warpsPerCTA = [16, 1], order = [1, 0]}> | |
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1, 8], threadsPerWarp = [1, 1, 64], warpsPerCTA = [4, 1, 4], order = [2, 1, 0]}> | |
#loc = loc(unknown) | |
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { | |
tt.func public @matvec(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown), %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} { | |
%cst = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%c2_i32 = arith.constant 2 : i32 loc(#loc) | |
%c1_i32 = arith.constant 1 : i32 loc(#loc) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc) | |
%c4_i32 = arith.constant 4 : i32 loc(#loc) | |
%c16_i32 = arith.constant 16 : i32 loc(#loc) | |
%cst_0 = arith.constant dense<4096> : tensor<4x1xi32, #blocked1> loc(#loc) | |
%cst_1 = arith.constant dense<2048> : tensor<4x2048xi32, #blocked1> loc(#loc) | |
%cst_2 = arith.constant dense<1024> : tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%cst_3 = arith.constant dense<2048> : tensor<16x2048xi32, #blocked2> loc(#loc) | |
%cst_4 = arith.constant dense<4096> : tensor<16x1xi32, #blocked2> loc(#loc) | |
%0 = tt.get_program_id x : i32 loc(#loc) | |
%1 = tt.get_program_id y : i32 loc(#loc) | |
%2 = arith.muli %0, %c4_i32 : i32 loc(#loc) | |
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%4 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%5 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%6 = tt.splat %2 : i32 -> tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%7 = arith.addi %5, %3 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> loc(#loc) | |
%8 = arith.addi %6, %4 : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%9 = arith.muli %1, %c16_i32 : i32 loc(#loc) | |
%10 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%12 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%13 = tt.splat %9 : i32 -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%14 = arith.addi %12, %10 : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> loc(#loc) | |
%15 = arith.addi %13, %11 : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> loc(#loc) | |
%16 = tt.expand_dims %7 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc) | |
%17 = tt.expand_dims %8 {axis = 1 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> -> tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%18 = arith.muli %16, %cst_0 : tensor<4x1xi32, #blocked1> loc(#loc) | |
%19 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%20 = tt.addptr %19, %18 : tensor<4x1x!tt.ptr<f16>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc) | |
%21 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc) | |
%22 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> loc(#loc) | |
%23 = tt.expand_dims %21 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x2048xi32, #blocked2> loc(#loc) | |
%24 = tt.expand_dims %22 {axis = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x2048xi32, #blocked1> loc(#loc) | |
%25 = tt.broadcast %20 : tensor<4x1x!tt.ptr<f16>, #blocked1> -> tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%26 = tt.broadcast %24 : tensor<1x2048xi32, #blocked1> -> tensor<4x2048xi32, #blocked1> loc(#loc) | |
%27 = tt.addptr %25, %26 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc) | |
%28 = tt.expand_dims %14 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1xi32, #blocked2> loc(#loc) | |
%29 = arith.muli %28, %cst_4 : tensor<16x1xi32, #blocked2> loc(#loc) | |
%30 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<16x1x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%31 = tt.addptr %30, %29 : tensor<16x1x!tt.ptr<f16>, #blocked2>, tensor<16x1xi32, #blocked2> loc(#loc) | |
%32 = tt.broadcast %31 : tensor<16x1x!tt.ptr<f16>, #blocked2> -> tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%33 = tt.broadcast %23 : tensor<1x2048xi32, #blocked2> -> tensor<16x2048xi32, #blocked2> loc(#loc) | |
%34 = tt.addptr %32, %33 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc) | |
%c-1_i32 = arith.constant -1 : i32 loc(#loc) | |
%c0_i32_5 = arith.constant 0 : i32 loc(#loc) | |
%c1_i32_6 = arith.constant 1 : i32 loc(#loc) | |
%c1_i32_7 = arith.constant 1 : i32 loc(#loc) | |
%c0_i32_8 = arith.constant 0 : i32 loc(#loc) | |
%35 = arith.muli %c1_i32, %c0_i32_8 : i32 loc(#loc) | |
%36 = arith.addi %c0_i32, %35 : i32 loc(#loc) | |
%37 = arith.cmpi slt, %36, %c2_i32 : i32 loc(#loc) | |
%c0_i32_9 = arith.constant 0 : i32 loc(#loc) | |
%38 = arith.muli %c1_i32, %c0_i32_9 : i32 loc(#loc) | |
%39 = arith.addi %c0_i32, %38 : i32 loc(#loc) | |
%40 = tt.splat %37 : i1 -> tensor<4x2048xi1, #blocked1> loc(#loc) | |
%41 = tt.load %27, %40 : tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%42 = tt.splat %37 : i1 -> tensor<16x2048xi1, #blocked2> loc(#loc) | |
%43 = tt.load %34, %42 : tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
%44:6 = scf.for %arg3 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %27, %arg6 = %34, %arg7 = %c-1_i32, %arg8 = %41, %arg9 = %43) -> (tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked2>, i32, tensor<4x2048xf16, #blocked1>, tensor<16x2048xf16, #blocked2>) : i32 { | |
%c1_i32_10 = arith.constant 1 : i32 loc(#loc) | |
%53 = arith.muli %c1_i32, %c1_i32_10 : i32 loc(#loc) | |
%54 = arith.subi %c2_i32, %53 : i32 loc(#loc) | |
%55 = arith.cmpi slt, %arg3, %54 : i32 loc(#loc) | |
%56 = arith.addi %arg7, %c1_i32_6 : i32 loc(#loc) | |
%57 = arith.cmpi slt, %56, %c1_i32_7 : i32 loc(#loc) | |
%58 = arith.select %57, %56, %c0_i32_5 : i32 loc(#loc) | |
%59 = tt.reshape %arg8 {allow_reorder = false} : tensor<4x2048xf16, #blocked1> -> tensor<4x1x2048xf16, #blocked3> loc(#loc) | |
%60 = tt.reshape %arg9 {allow_reorder = false} : tensor<16x2048xf16, #blocked2> -> tensor<1x16x2048xf16, #blocked> loc(#loc) | |
%61 = triton_gpu.convert_layout %59 : tensor<4x1x2048xf16, #blocked3> -> tensor<4x1x2048xf16, #blocked> loc(#loc) | |
%62 = arith.extf %61 : tensor<4x1x2048xf16, #blocked> to tensor<4x1x2048xf32, #blocked> loc(#loc) | |
%63 = tt.broadcast %62 : tensor<4x1x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%64 = arith.extf %60 : tensor<1x16x2048xf16, #blocked> to tensor<1x16x2048xf32, #blocked> loc(#loc) | |
%65 = tt.broadcast %64 : tensor<1x16x2048xf32, #blocked> -> tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%66 = arith.mulf %63, %65 : tensor<4x16x2048xf32, #blocked> loc(#loc) | |
%67 = "tt.reduce"(%66) <{axis = 2 : i32}> ({ | |
^bb0(%arg10: f32 loc(unknown), %arg11: f32 loc(unknown)): | |
%75 = arith.addf %arg10, %arg11 : f32 loc(#loc) | |
tt.reduce.return %75 : f32 loc(#loc) | |
}) {preserve_layout} : (tensor<4x16x2048xf32, #blocked>) -> tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%68 = arith.addf %arg4, %67 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%69 = tt.addptr %arg5, %cst_1 : tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<4x2048xi32, #blocked1> loc(#loc) | |
%70 = tt.addptr %arg6, %cst_3 : tensor<16x2048x!tt.ptr<f16>, #blocked2>, tensor<16x2048xi32, #blocked2> loc(#loc) | |
%71 = tt.splat %55 : i1 -> tensor<4x2048xi1, #blocked1> loc(#loc) | |
%72 = tt.load %69, %71 : tensor<4x2048x!tt.ptr<f16>, #blocked1> loc(#loc) | |
%73 = tt.splat %55 : i1 -> tensor<16x2048xi1, #blocked2> loc(#loc) | |
%74 = tt.load %70, %73 : tensor<16x2048x!tt.ptr<f16>, #blocked2> loc(#loc) | |
scf.yield %68, %69, %70, %58, %72, %74 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x2048x!tt.ptr<f16>, #blocked1>, tensor<16x2048x!tt.ptr<f16>, #blocked2>, i32, tensor<4x2048xf16, #blocked1>, tensor<16x2048xf16, #blocked2> loc(#loc) | |
} loc(#loc) | |
%45 = arith.muli %17, %cst_2 : tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%46 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%47 = tt.addptr %46, %45 : tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x1xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%48 = tt.expand_dims %15 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #triton_gpu.slice<{dim = 2, parent = #blocked}>}>> -> tensor<1x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%49 = tt.broadcast %47 : tensor<4x1x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%50 = tt.broadcast %48 : tensor<1x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<4x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%51 = tt.addptr %49, %50 : tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>>, tensor<4x16xi32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
%52 = arith.truncf %44#0 : tensor<4x16xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> to tensor<4x16xf16, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
tt.store %51, %52 : tensor<4x16x!tt.ptr<f16>, #triton_gpu.slice<{dim = 2, parent = #blocked}>> loc(#loc) | |
tt.return loc(#loc) | |
} loc(#loc) | |
} loc(#loc) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.text | |
.amdgcn_target "amdgcn-amd-amdhsa--gfx942" | |
.amdhsa_code_object_version 4 | |
.globl matvec | |
.p2align 8 | |
.type matvec,@function | |
matvec: | |
s_load_dwordx4 s[8:11], s[0:1], 0x0 | |
s_load_dwordx2 s[12:13], s[0:1], 0x10 | |
v_lshrrev_b32_e32 v1, 6, v0 | |
v_lshl_or_b32 v40, s3, 4, v1 | |
v_lshlrev_b32_e32 v4, 12, v40 | |
v_lshlrev_b32_e32 v1, 3, v0 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v2, s10 | |
v_mov_b32_e32 v3, s11 | |
v_ashrrev_i32_e32 v5, 31, v4 | |
v_and_b32_e32 v6, 0x1f8, v1 | |
v_lshl_add_u64 v[2:3], v[4:5], 1, v[2:3] | |
v_lshlrev_b32_e32 v42, 1, v6 | |
v_mov_b32_e32 v43, 0 | |
v_lshlrev_b32_e32 v7, 4, v0 | |
v_lshl_add_u64 v[44:45], v[2:3], 0, v[42:43] | |
v_and_b32_e32 v2, 0x3000, v7 | |
v_lshl_or_b32 v2, s2, 14, v2 | |
v_ashrrev_i32_e32 v3, 31, v2 | |
v_lshlrev_b64 v[2:3], 1, v[2:3] | |
global_load_dwordx4 v[32:35], v[44:45], off | |
global_load_dwordx4 v[28:31], v[44:45], off offset:1024 | |
global_load_dwordx4 v[24:27], v[44:45], off offset:2048 | |
global_load_dwordx4 v[16:19], v[44:45], off offset:3072 | |
v_lshl_add_u64 v[4:5], s[8:9], 0, v[2:3] | |
v_and_b32_e32 v42, 0xff0, v7 | |
v_lshl_add_u64 v[4:5], v[4:5], 0, v[42:43] | |
global_load_dwordx4 v[36:39], v[4:5], off | |
v_lshrrev_b32_e32 v4, 5, v0 | |
v_and_b32_e32 v4, 24, v4 | |
v_add_u32_e32 v1, v4, v1 | |
v_lshl_add_u32 v41, v1, 2, 0 | |
v_lshlrev_b32_e32 v1, 2, v0 | |
v_and_b32_e32 v0, 0xff, v0 | |
v_lshl_or_b32 v2, v0, 4, v2 | |
v_xor_b32_e32 v51, 0x80, v1 | |
v_lshl_add_u64 v[0:1], v[2:3], 0, s[8:9] | |
s_mov_b64 s[0:1], 0x1000 | |
s_mov_b32 s4, 0 | |
v_lshl_add_u32 v50, v6, 2, 0 | |
v_lshl_add_u64 v[48:49], v[0:1], 0, s[0:1] | |
s_mov_b64 s[14:15], -1 | |
s_mov_b64 s[8:9], 0 | |
v_mov_b32_e32 v42, v43 | |
v_mov_b32_e32 v46, v43 | |
v_mov_b32_e32 v47, v43 | |
s_branch .LBB0_3 | |
.LBB0_1: | |
v_lshl_add_u64 v[20:21], v[48:49], 0, s[8:9] | |
global_load_dwordx4 v[20:23], v[20:21], off | |
.LBB0_2: | |
s_waitcnt vmcnt(0) | |
v_cvt_f32_f16_sdwa v59, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 | |
v_cvt_f32_f16_sdwa v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 | |
v_cvt_f32_f16_e32 v58, v39 | |
v_cvt_f32_f16_e32 v56, v38 | |
v_cvt_f32_f16_sdwa v55, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 | |
v_cvt_f32_f16_sdwa v53, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 | |
v_cvt_f32_f16_e32 v54, v37 | |
v_cvt_f32_f16_e32 v52, v36 | |
s_barrier | |
ds_write_b128 v41, v[56:59] offset:16 | |
ds_write_b128 v41, v[52:55] | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
ds_read_b128 v[36:39], v50 | |
ds_read_b128 v[52:55], v50 offset:16 | |
v_cvt_f32_f16_sdwa v56, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 | |
s_add_u32 s8, s8, 0x1000 | |
s_addc_u32 s9, s9, 0 | |
s_mov_b64 s[14:15], 0 | |
s_waitcnt lgkmcnt(1) | |
v_mul_f32_e32 v37, v37, v56 | |
v_fma_mix_f32 v36, v36, v32, v37 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v38, v33, v36 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v57, v39, v33, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[36:39], v50 offset:8224 | |
s_waitcnt lgkmcnt(1) | |
v_fma_mix_f32 v52, v52, v34, v57 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v52, v53, v34, v52 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v52, v54, v35, v52 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v52, v55, v35, v52 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v37, v37, v56 | |
v_fma_mix_f32 v36, v36, v32, v37 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v38, v33, v36 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v53, v39, v33, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[36:39], v50 offset:8240 | |
v_cvt_f32_f16_e32 v57, v18 | |
s_andn2_b64 vcc, exec, s[10:11] | |
s_waitcnt lgkmcnt(0) | |
v_fma_mix_f32 v36, v36, v34, v53 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v37, v34, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v38, v35, v36 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v53, v39, v35, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[36:39], v50 offset:16448 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v37, v37, v56 | |
v_fma_mix_f32 v36, v36, v32, v37 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v38, v33, v36 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v54, v39, v33, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[36:39], v50 offset:16464 | |
s_waitcnt lgkmcnt(0) | |
v_fma_mix_f32 v36, v36, v34, v54 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v37, v34, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v38, v35, v36 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v54, v39, v35, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[36:39], v50 offset:24672 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v37, v37, v56 | |
v_fma_mix_f32 v32, v36, v32, v37 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v32, v38, v33, v32 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v32, v39, v33, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[36:39], v50 offset:24688 | |
v_cvt_f32_f16_sdwa v56, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 | |
s_waitcnt lgkmcnt(0) | |
v_fma_mix_f32 v32, v36, v34, v32 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v32, v37, v34, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v32, v38, v35, v32 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v55, v39, v35, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[32:35], v50 offset:2048 | |
ds_read_b128 v[36:39], v50 offset:2064 | |
s_waitcnt lgkmcnt(1) | |
v_fma_mix_f32 v32, v32, v28, v52 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v32, v33, v28, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v32, v34, v29, v32 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v52, v35, v29, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[32:35], v50 offset:4096 | |
s_waitcnt lgkmcnt(1) | |
v_fma_mix_f32 v36, v36, v30, v52 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v37, v30, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v38, v31, v36 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v52, v39, v31, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[36:39], v50 offset:4112 | |
s_waitcnt lgkmcnt(1) | |
v_fma_mix_f32 v32, v32, v24, v52 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v32, v33, v24, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v32, v34, v25, v32 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v32, v35, v25, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
s_waitcnt lgkmcnt(0) | |
v_fma_mix_f32 v36, v36, v26, v32 op_sel_hi:[0,1,0] | |
ds_read_b128 v[32:35], v50 offset:10272 | |
v_fma_mix_f32 v36, v37, v26, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v38, v27, v36 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v52, v39, v27, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[36:39], v50 offset:10288 | |
s_waitcnt lgkmcnt(1) | |
v_fma_mix_f32 v32, v32, v28, v53 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v32, v33, v28, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v32, v34, v29, v32 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v32, v35, v29, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
s_waitcnt lgkmcnt(0) | |
v_fma_mix_f32 v36, v36, v30, v32 op_sel_hi:[0,1,0] | |
ds_read_b128 v[32:35], v50 offset:18496 | |
v_fma_mix_f32 v36, v37, v30, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v38, v31, v36 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v53, v39, v31, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[36:39], v50 offset:18512 | |
s_waitcnt lgkmcnt(1) | |
v_fma_mix_f32 v32, v32, v28, v54 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v32, v33, v28, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v32, v34, v29, v32 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v32, v35, v29, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
s_waitcnt lgkmcnt(0) | |
v_fma_mix_f32 v36, v36, v30, v32 op_sel_hi:[0,1,0] | |
ds_read_b128 v[32:35], v50 offset:26720 | |
v_fma_mix_f32 v36, v37, v30, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v38, v31, v36 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v54, v39, v31, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[36:39], v50 offset:26736 | |
s_waitcnt lgkmcnt(1) | |
v_fma_mix_f32 v32, v32, v28, v55 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v28, v33, v28, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v28, v34, v29, v28 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v28, v35, v29, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[32:35], v50 offset:12320 | |
s_waitcnt lgkmcnt(1) | |
v_fma_mix_f32 v28, v36, v30, v28 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v28, v37, v30, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v28, v38, v31, v28 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v55, v39, v31, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[28:31], v50 offset:12336 | |
ds_read_b128 v[36:39], v50 offset:6144 | |
s_waitcnt lgkmcnt(2) | |
v_fma_mix_f32 v32, v32, v24, v53 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v32, v33, v24, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v32, v34, v25, v32 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v53, v35, v25, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[32:35], v50 offset:6160 | |
s_waitcnt lgkmcnt(1) | |
v_fma_mix_f32 v36, v36, v16, v52 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v37, v16, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v38, v17, v36 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v52, v39, v17, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[36:39], v50 offset:20544 | |
v_fma_mix_f32 v28, v28, v26, v53 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v28, v29, v26, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v28, v30, v27, v28 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v53, v31, v27, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[28:31], v50 offset:20560 | |
s_waitcnt lgkmcnt(1) | |
v_fma_mix_f32 v36, v36, v24, v54 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v37, v24, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v38, v25, v36 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v54, v39, v25, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[36:39], v50 offset:28768 | |
s_waitcnt lgkmcnt(1) | |
v_fma_mix_f32 v28, v28, v26, v54 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v28, v29, v26, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v28, v30, v27, v28 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v54, v31, v27, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[28:31], v50 offset:28784 | |
s_waitcnt lgkmcnt(1) | |
v_fma_mix_f32 v36, v36, v24, v55 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v24, v37, v24, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v24, v38, v25, v24 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v24, v39, v25, v24 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[36:39], v50 offset:14368 | |
s_waitcnt lgkmcnt(1) | |
v_fma_mix_f32 v24, v28, v26, v24 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v24, v29, v26, v24 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v24, v30, v27, v24 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v55, v31, v27, v24 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[24:27], v50 offset:14384 | |
s_waitcnt lgkmcnt(1) | |
v_fma_mix_f32 v36, v36, v16, v53 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v37, v16, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v36, v38, v17, v36 op_sel_hi:[0,1,0] | |
ds_read_b128 v[28:31], v50 offset:22592 | |
v_fma_mix_f32 v53, v39, v17, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
ds_read_b128 v[36:39], v50 offset:30816 | |
s_waitcnt lgkmcnt(1) | |
v_fma_mix_f32 v28, v28, v16, v54 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v28, v29, v16, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
s_waitcnt lgkmcnt(0) | |
v_fma_mix_f32 v36, v36, v16, v55 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v16, v37, v16, v36 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v28, v30, v17, v28 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v16, v38, v17, v16 op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v54, v31, v17, v28 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_fma_mix_f32 v55, v39, v17, v16 op_sel:[0,1,0] op_sel_hi:[0,1,0] | |
v_mul_f32_e32 v16, v32, v57 | |
v_mul_f32_e32 v17, v24, v57 | |
ds_read_b128 v[28:31], v50 offset:22608 | |
v_cvt_f32_f16_sdwa v36, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 | |
v_cvt_f32_f16_e32 v38, v19 | |
v_pk_add_f32 v[52:53], v[16:17], v[52:53] | |
ds_read_b128 v[16:19], v50 offset:30832 | |
s_waitcnt lgkmcnt(1) | |
v_mul_f32_e32 v58, v28, v57 | |
v_mov_b32_e32 v24, v33 | |
v_pk_fma_f32 v[24:25], v[24:25], v[36:37], v[52:53] op_sel_hi:[1,0,1] | |
v_mov_b32_e32 v28, v34 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v59, v16, v57 | |
v_pk_add_f32 v[54:55], v[58:59], v[54:55] | |
v_mov_b32_e32 v16, v29 | |
v_mov_b32_e32 v29, v26 | |
v_pk_fma_f32 v[16:17], v[16:17], v[36:37], v[54:55] op_sel_hi:[1,0,1] | |
v_pk_fma_f32 v[24:25], v[28:29], v[38:39], v[24:25] op_sel_hi:[1,0,1] | |
v_mov_b32_e32 v28, v30 | |
v_mov_b32_e32 v29, v18 | |
v_pk_fma_f32 v[16:17], v[28:29], v[38:39], v[16:17] op_sel_hi:[1,0,1] | |
v_mov_b32_e32 v18, v31 | |
v_mov_b32_e32 v26, v35 | |
v_pk_fma_f32 v[16:17], v[18:19], v[56:57], v[16:17] op_sel_hi:[1,0,1] | |
v_pk_fma_f32 v[18:19], v[26:27], v[56:57], v[24:25] op_sel_hi:[1,0,1] | |
ds_bpermute_b32 v24, v51, v18 | |
ds_bpermute_b32 v26, v51, v16 | |
ds_bpermute_b32 v27, v51, v17 | |
ds_bpermute_b32 v25, v51, v19 | |
v_mov_b64_e32 v[38:39], v[22:23] | |
v_mov_b64_e32 v[34:35], v[2:3] | |
v_mov_b64_e32 v[30:31], v[6:7] | |
s_waitcnt lgkmcnt(1) | |
v_pk_add_f32 v[16:17], v[16:17], v[26:27] | |
s_waitcnt lgkmcnt(0) | |
v_pk_add_f32 v[18:19], v[18:19], v[24:25] | |
ds_swizzle_b32 v24, v18 offset:swizzle(SWAP,16) | |
ds_swizzle_b32 v26, v16 offset:swizzle(SWAP,16) | |
ds_swizzle_b32 v27, v17 offset:swizzle(SWAP,16) | |
ds_swizzle_b32 v25, v19 offset:swizzle(SWAP,16) | |
v_mov_b64_e32 v[36:37], v[20:21] | |
v_mov_b64_e32 v[32:33], v[0:1] | |
v_mov_b64_e32 v[28:29], v[4:5] | |
s_waitcnt lgkmcnt(1) | |
v_pk_add_f32 v[16:17], v[16:17], v[26:27] | |
s_waitcnt lgkmcnt(0) | |
v_pk_add_f32 v[18:19], v[18:19], v[24:25] | |
ds_swizzle_b32 v24, v18 offset:swizzle(SWAP,8) | |
ds_swizzle_b32 v26, v16 offset:swizzle(SWAP,8) | |
ds_swizzle_b32 v27, v17 offset:swizzle(SWAP,8) | |
ds_swizzle_b32 v25, v19 offset:swizzle(SWAP,8) | |
s_waitcnt lgkmcnt(1) | |
v_pk_add_f32 v[16:17], v[16:17], v[26:27] | |
s_waitcnt lgkmcnt(0) | |
v_pk_add_f32 v[18:19], v[18:19], v[24:25] | |
ds_swizzle_b32 v24, v18 offset:swizzle(SWAP,4) | |
ds_swizzle_b32 v26, v16 offset:swizzle(SWAP,4) | |
ds_swizzle_b32 v27, v17 offset:swizzle(SWAP,4) | |
ds_swizzle_b32 v25, v19 offset:swizzle(SWAP,4) | |
s_waitcnt lgkmcnt(1) | |
v_pk_add_f32 v[16:17], v[16:17], v[26:27] | |
s_waitcnt lgkmcnt(0) | |
v_pk_add_f32 v[18:19], v[18:19], v[24:25] | |
ds_swizzle_b32 v24, v18 offset:swizzle(SWAP,2) | |
ds_swizzle_b32 v26, v16 offset:swizzle(SWAP,2) | |
ds_swizzle_b32 v27, v17 offset:swizzle(SWAP,2) | |
ds_swizzle_b32 v25, v19 offset:swizzle(SWAP,2) | |
s_waitcnt lgkmcnt(1) | |
v_pk_add_f32 v[16:17], v[16:17], v[26:27] | |
s_waitcnt lgkmcnt(0) | |
v_pk_add_f32 v[18:19], v[18:19], v[24:25] | |
ds_swizzle_b32 v24, v18 offset:swizzle(SWAP,1) | |
ds_swizzle_b32 v25, v19 offset:swizzle(SWAP,1) | |
ds_swizzle_b32 v26, v16 offset:swizzle(SWAP,1) | |
ds_swizzle_b32 v27, v17 offset:swizzle(SWAP,1) | |
s_waitcnt lgkmcnt(2) | |
v_pk_add_f32 v[18:19], v[18:19], v[24:25] | |
s_nop 0 | |
v_pk_add_f32 v[42:43], v[42:43], v[18:19] | |
s_waitcnt lgkmcnt(0) | |
v_pk_add_f32 v[16:17], v[16:17], v[26:27] | |
v_mov_b64_e32 v[26:27], v[10:11] | |
v_pk_add_f32 v[46:47], v[46:47], v[16:17] | |
v_mov_b64_e32 v[18:19], v[14:15] | |
v_mov_b64_e32 v[24:25], v[8:9] | |
v_mov_b64_e32 v[16:17], v[12:13] | |
s_cbranch_vccz .LBB0_14 | |
.LBB0_3: | |
s_xor_b64 s[10:11], s[14:15], -1 | |
v_lshl_add_u64 v[12:13], v[44:45], 0, s[8:9] | |
s_mov_b32 s5, s4 | |
s_mov_b32 s6, s4 | |
s_mov_b32 s7, s4 | |
s_and_b64 vcc, exec, s[10:11] | |
s_cbranch_vccnz .LBB0_6 | |
v_add_co_u32_e32 v0, vcc, 0x1000, v12 | |
s_nop 1 | |
v_addc_co_u32_e32 v1, vcc, 0, v13, vcc | |
global_load_dwordx4 v[0:3], v[0:1], off | |
v_cndmask_b32_e64 v4, 0, 1, s[14:15] | |
v_cmp_ne_u32_e64 s[0:1], 1, v4 | |
s_andn2_b64 vcc, exec, s[14:15] | |
s_cbranch_vccz .LBB0_7 | |
.LBB0_5: | |
v_mov_b64_e32 v[4:5], s[4:5] | |
v_mov_b64_e32 v[6:7], s[6:7] | |
s_branch .LBB0_8 | |
.LBB0_6: | |
v_mov_b64_e32 v[0:1], s[4:5] | |
v_mov_b64_e32 v[2:3], s[6:7] | |
v_cndmask_b32_e64 v4, 0, 1, s[14:15] | |
v_cmp_ne_u32_e64 s[0:1], 1, v4 | |
s_andn2_b64 vcc, exec, s[14:15] | |
s_cbranch_vccnz .LBB0_5 | |
.LBB0_7: | |
v_add_co_u32_e32 v4, vcc, 0x1000, v12 | |
s_nop 1 | |
v_addc_co_u32_e32 v5, vcc, 0, v13, vcc | |
global_load_dwordx4 v[4:7], v[4:5], off offset:1024 | |
.LBB0_8: | |
s_mov_b32 s5, s4 | |
s_mov_b32 s6, s4 | |
s_and_b64 vcc, exec, s[0:1] | |
s_mov_b32 s7, s4 | |
s_cbranch_vccnz .LBB0_11 | |
v_add_co_u32_e32 v8, vcc, 0x1000, v12 | |
s_nop 1 | |
v_addc_co_u32_e32 v9, vcc, 0, v13, vcc | |
global_load_dwordx4 v[8:11], v[8:9], off offset:2048 | |
s_and_b64 vcc, exec, s[0:1] | |
s_cbranch_vccz .LBB0_12 | |
.LBB0_10: | |
v_mov_b64_e32 v[14:15], s[6:7] | |
v_mov_b64_e32 v[12:13], s[4:5] | |
s_and_b64 vcc, exec, s[0:1] | |
s_cbranch_vccz .LBB0_1 | |
s_branch .LBB0_13 | |
.LBB0_11: | |
v_mov_b64_e32 v[10:11], s[6:7] | |
v_mov_b64_e32 v[8:9], s[4:5] | |
s_and_b64 vcc, exec, s[0:1] | |
s_cbranch_vccnz .LBB0_10 | |
.LBB0_12: | |
v_add_co_u32_e32 v12, vcc, 0x1000, v12 | |
s_nop 1 | |
v_addc_co_u32_e32 v13, vcc, 0, v13, vcc | |
global_load_dwordx4 v[12:15], v[12:13], off offset:3072 | |
s_and_b64 vcc, exec, s[0:1] | |
s_cbranch_vccz .LBB0_1 | |
.LBB0_13: | |
s_mov_b32 s6, s4 | |
s_mov_b32 s7, s4 | |
s_mov_b32 s5, s4 | |
v_mov_b64_e32 v[22:23], s[6:7] | |
v_mov_b64_e32 v[20:21], s[4:5] | |
s_branch .LBB0_2 | |
.LBB0_14: | |
s_lshl_b32 s0, s2, 12 | |
s_ashr_i32 s1, s0, 31 | |
s_or_b32 s2, s0, 0x800 | |
s_or_b32 s4, s0, 0xc00 | |
s_lshl_b64 s[0:1], s[0:1], 1 | |
s_add_u32 s0, s12, s0 | |
s_addc_u32 s1, s13, s1 | |
s_ashr_i32 s3, s2, 31 | |
s_lshl_b64 s[2:3], s[2:3], 1 | |
s_add_u32 s2, s12, s2 | |
s_addc_u32 s3, s13, s3 | |
s_ashr_i32 s5, s4, 31 | |
v_cvt_f16_f32_e32 v6, v42 | |
s_lshl_b64 s[4:5], s[4:5], 1 | |
v_ashrrev_i32_e32 v41, 31, v40 | |
v_cvt_f16_f32_e32 v7, v43 | |
s_add_u32 s4, s12, s4 | |
v_lshlrev_b64 v[0:1], 1, v[40:41] | |
v_cvt_f16_f32_e32 v8, v46 | |
s_addc_u32 s5, s13, s5 | |
v_lshl_add_u64 v[2:3], s[0:1], 0, v[0:1] | |
v_cvt_f16_f32_e32 v9, v47 | |
v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] | |
v_lshl_add_u64 v[0:1], s[4:5], 0, v[0:1] | |
global_store_short v[2:3], v6, off | |
global_store_short v[2:3], v7, off offset:2048 | |
global_store_short v[4:5], v8, off | |
global_store_short v[0:1], v9, off | |
s_endpgm | |
.section .rodata,"a",@progbits | |
.p2align 6, 0x0 | |
.amdhsa_kernel matvec | |
.amdhsa_group_segment_fixed_size 0 | |
.amdhsa_private_segment_fixed_size 0 | |
.amdhsa_kernarg_size 24 | |
.amdhsa_user_sgpr_count 2 | |
.amdhsa_user_sgpr_dispatch_ptr 0 | |
.amdhsa_user_sgpr_queue_ptr 0 | |
.amdhsa_user_sgpr_kernarg_segment_ptr 1 | |
.amdhsa_user_sgpr_dispatch_id 0 | |
.amdhsa_user_sgpr_kernarg_preload_length 0 | |
.amdhsa_user_sgpr_kernarg_preload_offset 0 | |
.amdhsa_user_sgpr_private_segment_size 0 | |
.amdhsa_enable_private_segment 0 | |
.amdhsa_system_sgpr_workgroup_id_x 1 | |
.amdhsa_system_sgpr_workgroup_id_y 1 | |
.amdhsa_system_sgpr_workgroup_id_z 0 | |
.amdhsa_system_sgpr_workgroup_info 0 | |
.amdhsa_system_vgpr_workitem_id 0 | |
.amdhsa_next_free_vgpr 60 | |
.amdhsa_next_free_sgpr 16 | |
.amdhsa_accum_offset 60 | |
.amdhsa_reserve_vcc 1 | |
.amdhsa_reserve_xnack_mask 1 | |
.amdhsa_float_round_mode_32 0 | |
.amdhsa_float_round_mode_16_64 0 | |
.amdhsa_float_denorm_mode_32 3 | |
.amdhsa_float_denorm_mode_16_64 3 | |
.amdhsa_dx10_clamp 1 | |
.amdhsa_ieee_mode 1 | |
.amdhsa_fp16_overflow 0 | |
.amdhsa_tg_split 0 | |
.amdhsa_exception_fp_ieee_invalid_op 0 | |
.amdhsa_exception_fp_denorm_src 0 | |
.amdhsa_exception_fp_ieee_div_zero 0 | |
.amdhsa_exception_fp_ieee_overflow 0 | |
.amdhsa_exception_fp_ieee_underflow 0 | |
.amdhsa_exception_fp_ieee_inexact 0 | |
.amdhsa_exception_int_div_zero 0 | |
.end_amdhsa_kernel | |
.text | |
.Lfunc_end0: | |
.size matvec, .Lfunc_end0-matvec | |
.p2alignl 6, 3212836864 | |
.fill 256, 4, 3212836864 | |
.section ".note.GNU-stack","",@progbits | |
.amdgpu_metadata | |
--- | |
amdhsa.kernels: | |
- .agpr_count: 0 | |
.args: | |
- .address_space: global | |
.offset: 0 | |
.size: 8 | |
.value_kind: global_buffer | |
- .address_space: global | |
.offset: 8 | |
.size: 8 | |
.value_kind: global_buffer | |
- .address_space: global | |
.offset: 16 | |
.size: 8 | |
.value_kind: global_buffer | |
.group_segment_fixed_size: 0 | |
.kernarg_segment_align: 8 | |
.kernarg_segment_size: 24 | |
.max_flat_workgroup_size: 1024 | |
.name: matvec | |
.private_segment_fixed_size: 0 | |
.sgpr_count: 22 | |
.sgpr_spill_count: 0 | |
.symbol: matvec.kd | |
.vgpr_count: 60 | |
.vgpr_spill_count: 0 | |
.wavefront_size: 64 | |
amdhsa.target: amdgcn-amd-amdhsa--gfx942 | |
amdhsa.version: | |
- 1 | |
- 1 | |
... | |
.end_amdgpu_metadata | |
Traceback (most recent call last): | |
File "/data/reduction/matvec.py", line 57, in <module> | |
triton.testing.assert_close(c, ref) | |
File "/data/triton/.venv/lib/python3.11/site-packages/triton/testing.py", line 212, in assert_close | |
np.testing.assert_allclose(x, y, atol=atol, rtol=rtol, equal_nan=True) | |
File "/data/triton/.venv/lib/python3.11/site-packages/numpy/testing/_private/utils.py", line 1504, in assert_allclose | |
assert_array_compare(compare, actual, desired, err_msg=str(err_msg), | |
File "/home/mirror/.pyenv/versions/3.11.5/lib/python3.11/contextlib.py", line 81, in inner | |
return func(*args, **kwds) | |
^^^^^^^^^^^^^^^^^^^ | |
File "/data/triton/.venv/lib/python3.11/site-packages/numpy/testing/_private/utils.py", line 797, in assert_array_compare | |
raise AssertionError(msg) | |
AssertionError: | |
Not equal to tolerance rtol=0, atol=0.01 | |
Mismatched elements: 3 / 8192 (0.0366%) | |
Max absolute difference: 0.03125 | |
Max relative difference: 0.002094 | |
x: array([[ 109.44 , -112.3 , 22.38 , ..., 24.73 , 172.4 , -68.25 ], | |
[ 43.1 , -72.7 , 28.06 , ..., 40.94 , 107.75 , 40.9 ], | |
[ 19.78 , -20.12 , -28.95 , ..., -77.1 , 92.9 , 68.1 ],... | |
y: array([[ 109.44 , -112.3 , 22.38 , ..., 24.73 , 172.4 , -68.25 ], | |
[ 43.1 , -72.7 , 28.06 , ..., 40.94 , 107.75 , 40.9 ], | |
[ 19.78 , -20.12 , -28.95 , ..., -77.1 , 92.9 , 68.1 ],... |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import triton | |
import triton.language as tl | |
@triton.jit
def matvec(
    a_ptr, b_ptr, c_ptr,
    M: tl.constexpr, N: tl.constexpr, K: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, TILE_SIZE_K: tl.constexpr,
):
    """Compute C = A @ B^T where A is (M, K) and B is (N, K), both row-major fp16.

    Each program instance produces one (BLOCK_SIZE_M, BLOCK_SIZE_N) tile of the
    (M, N) output, marching over the K dimension in chunks of TILE_SIZE_K.
    The product is expressed as a broadcasted elementwise multiply followed by
    a tl.sum reduction along K (no tl.dot), with accumulation in fp32; the
    result is cast back to fp16 on store.
    """
    # Assume aligned shapes w.r.t. tile sizes
    tl.static_assert(M % BLOCK_SIZE_M == 0)
    tl.static_assert(N % BLOCK_SIZE_N == 0)
    tl.static_assert(K % TILE_SIZE_K == 0)
    # Grid axis 0 tiles M, axis 1 tiles N.
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    offset_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offset_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offset_k = tl.arange(0, TILE_SIZE_K)
    # Both operands are laid out with K innermost (A row-major (M, K),
    # B row-major (N, K)), so loads along K are contiguous for each.
    a_ptrs = a_ptr + offset_m[:, None] * K + offset_k[None, :]
    b_ptrs = b_ptr + offset_n[:, None] * K + offset_k[None, :]
    # fp32 accumulator for the whole output tile.
    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, TILE_SIZE_K)):
        a = tl.load(a_ptrs)  # (BLOCK_SIZE_M, TILE_SIZE_K)
        b = tl.load(b_ptrs)  # (BLOCK_SIZE_N, TILE_SIZE_K)
        # Broadcast both tiles to a common (M, N, K) block shape so the
        # K reduction can be written as elementwise product + tl.sum.
        a_bc = a.reshape(BLOCK_SIZE_M, 1, TILE_SIZE_K).broadcast_to(BLOCK_SIZE_M, BLOCK_SIZE_N, TILE_SIZE_K)
        b_bc = b.reshape(1, BLOCK_SIZE_N, TILE_SIZE_K).broadcast_to(BLOCK_SIZE_M, BLOCK_SIZE_N, TILE_SIZE_K)
        # Promote to fp32 before multiplying so the reduction accumulates
        # at full precision rather than in fp16.
        a_f32 = a_bc.to(tl.float32)
        b_f32 = b_bc.to(tl.float32)
        acc += tl.sum(a_f32 * b_f32, axis=-1)
        # Advance the pointers to the next K tile.
        a_ptrs += TILE_SIZE_K
        b_ptrs += TILE_SIZE_K
    # C is row-major (M, N); convert the accumulator to fp16 and store.
    c_ptrs = c_ptr + offset_m[:, None] * N + offset_n[None, :]
    c = acc.to(tl.float16)
    tl.store(c_ptrs, c)
# Problem sizes: C (m, n) = A (m, k) @ B (n, k)^T.
m = 8
n = 1024
k = 4096
a = torch.randn((m, k), device='cuda', dtype=torch.float16)
b = torch.randn((n, k), device='cuda', dtype=torch.float16)
c = torch.zeros((m, n), device='cuda', dtype=torch.float16)
# Launch configuration: one program per (block_m, block_n) output tile,
# with one warp per output column of the tile.
n_warps = 16
block_m = 4
block_n = n_warps
tile_k = 2048
grid = (triton.cdiv(m, block_m), triton.cdiv(n, block_n))
matvec[grid](a, b, c, m, n, k, block_m, block_n, tile_k, num_warps=n_warps)
# Build the reference with fp32 accumulation to mirror the kernel's fp32
# accumulator; an fp16 matmul reference can disagree purely through
# accumulation precision/order, not an actual kernel bug.
ref = torch.matmul(a.float(), torch.transpose(b, 0, 1).float()).to(torch.float16)
# Compare with a relative tolerance. An absolute-only tolerance of 0.01
# spuriously fails for outputs of magnitude ~100: one fp16 ulp at 100 is
# 0.0625, so reduction-order differences alone can exceed 0.01.
torch.testing.assert_close(c, ref, rtol=2e-2, atol=1e-2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment