Created
March 25, 2025 13:01
-
-
Save pashu123/371870f7abde322599950ca1b0842185 to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
// Top-level module wrapping a single HAL executable, @main$async_dispatch_14.
module { | |
hal.executable public @main$async_dispatch_14 { | |
// Single variant, built for the target described by #executable_target_rocm_hsaco_fb.
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export at ordinal 0 for the dispatch function below; its region returns the
// x/y/z workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: loads a 2x32x10x16384 f16 tensor from binding 0 and stores a
// 2x32x10x16384 f32 tensor to binding 1. For every (d0, d1) pair it reduces over
// d2 and d3 to obtain a mean and a sum of squared deviations, then yields
// (x - mean) * rsqrt(sum_sq / %cst_0 + %cst) for each element.
// NOTE(review): this has the shape of a group/layer-norm style normalization
// without scale or bias -- confirm against the originating model.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset passed to the binding(0) subspan.
%c22356480 = arith.constant 22356480 : index | |
// Added to the variance before rsqrt; presumably epsilon 1e-5 as rounded to f32.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, i.e. the number of elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Initial value for the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast (unsigned) to index, and annotated with
// the listed util.assume.int range/divisor assumptions; %2 is the binding(1) offset.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// %3: read-only f16 input view on binding 0; %4: write-only f32 output view on binding 1.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf32> | |
// Destination tensors: %6 for the full-size f32 results, %7 for the 2x32 per-(d0, d1) results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over d2 and d3 for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = sum / %cst_0.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over d2 and d3 of (x - mean)^2 for each (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: per-element result (x - mean) * rsqrt(%12 / %cst_0 + %cst). This op reads the
// original f16 input %5 and extends it again instead of consuming %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor to the binding(1) view.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
// Top-level module wrapping a single HAL executable, @main$async_dispatch_14.
module { | |
hal.executable public @main$async_dispatch_14 { | |
// Single variant, built for the target described by #executable_target_rocm_hsaco_fb.
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export at ordinal 0 for the dispatch function below; its region returns the
// x/y/z workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: loads a 2x32x10x16384 f16 tensor from binding 0 and stores a
// 2x32x10x16384 f32 tensor to binding 1. For every (d0, d1) pair it reduces over
// d2 and d3 to obtain a mean and a sum of squared deviations, then yields
// (x - mean) * rsqrt(sum_sq / %cst_0 + %cst) for each element.
// NOTE(review): this has the shape of a group/layer-norm style normalization
// without scale or bias -- confirm against the originating model.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset passed to the binding(0) subspan.
%c22356480 = arith.constant 22356480 : index | |
// Added to the variance before rsqrt; presumably epsilon 1e-5 as rounded to f32.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, i.e. the number of elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Initial value for the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast (unsigned) to index, and annotated with
// the listed util.assume.int range/divisor assumptions; %2 is the binding(1) offset.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// %3: read-only f16 input view on binding 0; %4: write-only f32 output view on binding 1.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for the full-size f32 results, %7 for the 2x32 per-(d0, d1) results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over d2 and d3 for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = sum / %cst_0.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over d2 and d3 of (x - mean)^2 for each (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: per-element result (x - mean) * rsqrt(%12 / %cst_0 + %cst). This op reads the
// original f16 input %5 and extends it again instead of consuming %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor to the binding(1) view.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
// Top-level module wrapping a single HAL executable, @main$async_dispatch_14.
module { | |
hal.executable public @main$async_dispatch_14 { | |
// Single variant, built for the target described by #executable_target_rocm_hsaco_fb.
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export at ordinal 0 for the dispatch function below; its region returns the
// x/y/z workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: loads a 2x32x10x16384 f16 tensor from binding 0 and stores a
// 2x32x10x16384 f32 tensor to binding 1. For every (d0, d1) pair it reduces over
// d2 and d3 to obtain a mean and a sum of squared deviations, then yields
// (x - mean) * rsqrt(sum_sq / %cst_0 + %cst) for each element.
// NOTE(review): this has the shape of a group/layer-norm style normalization
// without scale or bias -- confirm against the originating model.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset passed to the binding(0) subspan.
%c22356480 = arith.constant 22356480 : index | |
// Added to the variance before rsqrt; presumably epsilon 1e-5 as rounded to f32.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, i.e. the number of elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Initial value for the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast (unsigned) to index, and annotated with
// the listed util.assume.int range/divisor assumptions; %2 is the binding(1) offset.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// %3: read-only f16 input view on binding 0; %4: write-only f32 output view on binding 1.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for the full-size f32 results, %7 for the 2x32 per-(d0, d1) results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over d2 and d3 for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = sum / %cst_0.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over d2 and d3 of (x - mean)^2 for each (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: per-element result (x - mean) * rsqrt(%12 / %cst_0 + %cst). This op reads the
// original f16 input %5 and extends it again instead of consuming %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor to the binding(1) view.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
// Top-level module wrapping a single HAL executable, @main$async_dispatch_14.
module { | |
hal.executable public @main$async_dispatch_14 { | |
// Single variant, built for the target described by #executable_target_rocm_hsaco_fb.
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export at ordinal 0 for the dispatch function below; its region returns the
// x/y/z workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: loads a 2x32x10x16384 f16 tensor from binding 0 and stores a
// 2x32x10x16384 f32 tensor to binding 1. For every (d0, d1) pair it reduces over
// d2 and d3 to obtain a mean and a sum of squared deviations, then yields
// (x - mean) * rsqrt(sum_sq / %cst_0 + %cst) for each element.
// NOTE(review): this has the shape of a group/layer-norm style normalization
// without scale or bias -- confirm against the originating model.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset passed to the binding(0) subspan.
%c22356480 = arith.constant 22356480 : index | |
// Added to the variance before rsqrt; presumably epsilon 1e-5 as rounded to f32.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, i.e. the number of elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Initial value for the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast (unsigned) to index, and annotated with
// the listed util.assume.int range/divisor assumptions; %2 is the binding(1) offset.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// %3: read-only f16 input view on binding 0; %4: write-only f32 output view on binding 1.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for the full-size f32 results, %7 for the 2x32 per-(d0, d1) results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over d2 and d3 for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = sum / %cst_0.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over d2 and d3 of (x - mean)^2 for each (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: per-element result (x - mean) * rsqrt(%12 / %cst_0 + %cst). This op reads the
// original f16 input %5 and extends it again instead of consuming %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor to the binding(1) view.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
// Top-level module wrapping a single HAL executable, @main$async_dispatch_14.
module { | |
hal.executable public @main$async_dispatch_14 { | |
// Single variant, built for the target described by #executable_target_rocm_hsaco_fb.
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export at ordinal 0 for the dispatch function below; its region returns the
// x/y/z workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: loads a 2x32x10x16384 f16 tensor from binding 0 and stores a
// 2x32x10x16384 f32 tensor to binding 1. For every (d0, d1) pair it reduces over
// d2 and d3 to obtain a mean and a sum of squared deviations, then yields
// (x - mean) * rsqrt(sum_sq / %cst_0 + %cst) for each element.
// NOTE(review): this has the shape of a group/layer-norm style normalization
// without scale or bias -- confirm against the originating model.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset passed to the binding(0) subspan.
%c22356480 = arith.constant 22356480 : index | |
// Added to the variance before rsqrt; presumably epsilon 1e-5 as rounded to f32.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, i.e. the number of elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Initial value for the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast (unsigned) to index, and annotated with
// the listed util.assume.int range/divisor assumptions; %2 is the binding(1) offset.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// %3: read-only f16 input view on binding 0; %4: write-only f32 output view on binding 1.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for the full-size f32 results, %7 for the 2x32 per-(d0, d1) results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over d2 and d3 for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = sum / %cst_0.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over d2 and d3 of (x - mean)^2 for each (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: per-element result (x - mean) * rsqrt(%12 / %cst_0 + %cst). This op reads the
// original f16 input %5 and extends it again instead of consuming %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor to the binding(1) view.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
// Top-level module wrapping one HAL executable (@main$async_dispatch_14) with a single variant
// (@rocm_hsaco_fb) and a single exported dispatch entry point.
module { | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export of the dispatch function at ordinal 0. Its region returns the three workgroup counts
// produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: takes no arguments and returns nothing. Reads a 2x32x10x16384 f16 tensor from
// binding 0 and writes a 2x32x10x16384 f32 tensor to binding 1.
// For each (d0, d1) pair it computes the mean and the sum of squared deviations over (d2, d3),
// then stores (x - mean) * rsqrt(sum_sq / 163840 + eps) for every element.
// NOTE(review): this has the shape of a group/layer-norm style normalization without scale/bias — confirm against the source model.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset passed to the binding(0) subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Epsilon added to the averaged squared deviation before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, the number of elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulator.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Output offset: constant ordinal 0 loaded as i32, cast unsigned to index, then annotated with the
// assumed umin/umax/udiv facts listed in util.assume.int.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// %3: read-only f16 input view on binding 0; %4: write-only f32 output view on binding 1.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors created with tensor.empty: full-size f32 and the 2x32 reduction shape.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: element-wise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator shared as the init value of both reductions.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over d2 and d3 (the two reduction iterators).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = %10 / 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over d2 and d3 of (%8 - mean)^2.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: normalized output. Takes the original f16 input %5 (re-extended in the body) rather than %8.
// Body: %14 = sum_sq / 163840, %15 = %14 + eps, %16 = rsqrt(%15), result = (extf(in) - mean) * %16.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- // | |
// Attribute aliases referenced by the module that follows in this dump.
// Executable target "rocm" / "rocm-hsaco-fb": abi = "hip", GPU arch = "gfx942", plus the listed MFMA
// intrinsics, subgroup size choice (64) and workgroup/memory limits.
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
// #map: 4-D identity map (used for full-size tensor operands).
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
// #map1: keeps (d0, d1) and drops (d2, d3) (used for the 2-D operands of the 4-D ops).
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
// #map2: 2-D identity map.
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
// Pipeline layout: one constant and two storage-buffer bindings (the first flagged "ReadOnly|Indirect", the second Indirect).
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
// Top-level module wrapping one HAL executable (@main$async_dispatch_14) with a single variant
// (@rocm_hsaco_fb) and a single exported dispatch entry point.
module { | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export of the dispatch function at ordinal 0. Its region returns the three workgroup counts
// produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: takes no arguments and returns nothing. Reads a 2x32x10x16384 f16 tensor from
// binding 0 and writes a 2x32x10x16384 f32 tensor to binding 1.
// For each (d0, d1) pair it computes the mean and the sum of squared deviations over (d2, d3),
// then stores (x - mean) * rsqrt(sum_sq / 163840 + eps) for every element.
// NOTE(review): this has the shape of a group/layer-norm style normalization without scale/bias — confirm against the source model.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset passed to the binding(0) subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Epsilon added to the averaged squared deviation before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, the number of elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulator.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Output offset: constant ordinal 0 loaded as i32, cast unsigned to index, then annotated with the
// assumed umin/umax/udiv facts listed in util.assume.int.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// %3: read-only f16 input view on binding 0; %4: write-only f32 output view on binding 1.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors created with tensor.empty: full-size f32 and the 2x32 reduction shape.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: element-wise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator shared as the init value of both reductions.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over d2 and d3 (the two reduction iterators).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = %10 / 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over d2 and d3 of (%8 - mean)^2.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: normalized output. Takes the original f16 input %5 (re-extended in the body) rather than %8.
// Body: %14 = sum_sq / 163840, %15 = %14 + eps, %16 = rsqrt(%15), result = (extf(in) - mean) * %16.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- // | |
// Attribute aliases referenced by the module that follows in this dump.
// Executable target "rocm" / "rocm-hsaco-fb": abi = "hip", GPU arch = "gfx942", plus the listed MFMA
// intrinsics, subgroup size choice (64) and workgroup/memory limits.
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
// #map: 4-D identity map (used for full-size tensor operands).
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
// #map1: keeps (d0, d1) and drops (d2, d3) (used for the 2-D operands of the 4-D ops).
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
// #map2: 2-D identity map.
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
// Pipeline layout: one constant and two storage-buffer bindings (the first flagged "ReadOnly|Indirect", the second Indirect).
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
// Top-level module wrapping one HAL executable (@main$async_dispatch_14) with a single variant
// (@rocm_hsaco_fb) and a single exported dispatch entry point.
module { | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export of the dispatch function at ordinal 0. Its region returns the three workgroup counts
// produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: takes no arguments and returns nothing. Reads a 2x32x10x16384 f16 tensor from
// binding 0 and writes a 2x32x10x16384 f32 tensor to binding 1.
// For each (d0, d1) pair it computes the mean and the sum of squared deviations over (d2, d3),
// then stores (x - mean) * rsqrt(sum_sq / 163840 + eps) for every element.
// NOTE(review): this has the shape of a group/layer-norm style normalization without scale/bias — confirm against the source model.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset passed to the binding(0) subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Epsilon added to the averaged squared deviation before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, the number of elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulator.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Output offset: constant ordinal 0 loaded as i32, cast unsigned to index, then annotated with the
// assumed umin/umax/udiv facts listed in util.assume.int.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// %3: read-only f16 input view on binding 0; %4: write-only f32 output view on binding 1.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors created with tensor.empty: full-size f32 and the 2x32 reduction shape.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: element-wise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator shared as the init value of both reductions.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over d2 and d3 (the two reduction iterators).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = %10 / 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over d2 and d3 of (%8 - mean)^2.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: normalized output. Takes the original f16 input %5 (re-extended in the body) rather than %8.
// Body: %14 = sum_sq / 163840, %15 = %14 + eps, %16 = rsqrt(%15), result = (extf(in) - mean) * %16.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Dispatch body, printed on its own with the pipeline layout and affine maps inlined instead of aliased.
// Takes no arguments and returns nothing. Reads a 2x32x10x16384 f16 tensor from binding 0 and writes a
// 2x32x10x16384 f32 tensor to binding 1.
// For each (d0, d1) pair it computes the mean and the sum of squared deviations over (d2, d3),
// then stores (x - mean) * rsqrt(sum_sq / 163840 + eps) for every element.
// NOTE(review): this has the shape of a group/layer-norm style normalization without scale/bias — confirm against the source model.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset passed to the binding(0) subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Epsilon added to the averaged squared deviation before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, the number of elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulator.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Output offset: constant ordinal 0 loaded as i32, cast unsigned to index, then annotated with the
// assumed umin/umax/udiv facts listed in util.assume.int.
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// %3: read-only f16 input view on binding 0; %4: write-only f32 output view on binding 1.
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors created with tensor.empty: full-size f32 and the 2x32 reduction shape.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: element-wise extension of the input from f16 to f32 (identity indexing on all four dims).
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator shared as the init value of both reductions.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over d2 and d3 (the two reduction iterators); the output map keeps only (d0, d1).
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = %10 / 163840.
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over d2 and d3 of (%8 - mean)^2.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: normalized output. Takes the original f16 input %5 (re-extended in the body) rather than %8.
// Body: %14 = sum_sq / 163840, %15 = %14 + eps, %16 = rsqrt(%15), result = (extf(in) - mean) * %16.
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After Inliner (inline) //----- // | |
// Attribute aliases referenced by the module that follows in this dump.
// Executable target "rocm" / "rocm-hsaco-fb": abi = "hip", GPU arch = "gfx942", plus the listed MFMA
// intrinsics, subgroup size choice (64) and workgroup/memory limits.
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
// #map: 4-D identity map (used for full-size tensor operands).
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
// #map1: keeps (d0, d1) and drops (d2, d3) (used for the 2-D operands of the 4-D ops).
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
// #map2: 2-D identity map.
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
// Pipeline layout: one constant and two storage-buffer bindings (the first flagged "ReadOnly|Indirect", the second Indirect).
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
// Top-level module wrapping one HAL executable (@main$async_dispatch_14) with a single variant
// (@rocm_hsaco_fb) and a single exported dispatch entry point.
module { | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export of the dispatch function at ordinal 0. Its region returns the three workgroup counts
// produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: takes no arguments and returns nothing. Reads a 2x32x10x16384 f16 tensor from
// binding 0 and writes a 2x32x10x16384 f32 tensor to binding 1.
// For each (d0, d1) pair it computes the mean and the sum of squared deviations over (d2, d3),
// then stores (x - mean) * rsqrt(sum_sq / 163840 + eps) for every element.
// NOTE(review): this has the shape of a group/layer-norm style normalization without scale/bias — confirm against the source model.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset passed to the binding(0) subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Epsilon added to the averaged squared deviation before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, the number of elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulator.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Output offset: constant ordinal 0 loaded as i32, cast unsigned to index, then annotated with the
// assumed umin/umax/udiv facts listed in util.assume.int.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// %3: read-only f16 input view on binding 0; %4: write-only f32 output view on binding 1.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors created with tensor.empty: full-size f32 and the 2x32 reduction shape.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: element-wise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator shared as the init value of both reductions.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over d2 and d3 (the two reduction iterators).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = %10 / 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over d2 and d3 of (%8 - mean)^2.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: normalized output. Takes the original f16 input %5 (re-extended in the body) rather than %8.
// Body: %14 = sum_sq / 163840, %15 = %14 + eps, %16 = rsqrt(%15), result = (extf(in) - mean) * %16.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
// Attribute aliases referenced by the module that follows in this dump.
// Executable target "rocm" / "rocm-hsaco-fb": abi = "hip", GPU arch = "gfx942", plus the listed MFMA
// intrinsics, subgroup size choice (64) and workgroup/memory limits.
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
// #map: 4-D identity map (used for full-size tensor operands).
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
// #map1: keeps (d0, d1) and drops (d2, d3) (used for the 2-D operands of the 4-D ops).
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
// #map2: 2-D identity map.
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
// Pipeline layout: one constant and two storage-buffer bindings (the first flagged "ReadOnly|Indirect", the second Indirect).
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
// Top-level module wrapping one HAL executable (@main$async_dispatch_14) with a single variant
// (@rocm_hsaco_fb) and a single exported dispatch entry point.
module { | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export of the dispatch function at ordinal 0. Its region returns the three workgroup counts
// produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: takes no arguments and returns nothing. Reads a 2x32x10x16384 f16 tensor from
// binding 0 and writes a 2x32x10x16384 f32 tensor to binding 1.
// For each (d0, d1) pair it computes the mean and the sum of squared deviations over (d2, d3),
// then stores (x - mean) * rsqrt(sum_sq / 163840 + eps) for every element.
// NOTE(review): this has the shape of a group/layer-norm style normalization without scale/bias — confirm against the source model.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset passed to the binding(0) subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Epsilon added to the averaged squared deviation before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, the number of elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulator.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Output offset: constant ordinal 0 loaded as i32, cast unsigned to index, then annotated with the
// assumed umin/umax/udiv facts listed in util.assume.int.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// %3: read-only f16 input view on binding 0; %4: write-only f32 output view on binding 1.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors created with tensor.empty: full-size f32 and the 2x32 reduction shape.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: element-wise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator shared as the init value of both reductions.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over d2 and d3 (the two reduction iterators).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = %10 / 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over d2 and d3 of (%8 - mean)^2.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: normalized output. Takes the original f16 input %5 (re-extended in the body) rather than %8.
// Body: %14 = sum_sq / 163840, %15 = %14 + eps, %16 = rsqrt(%15), result = (extf(in) - mean) * %16.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {hal.device.targets = [#device_target_hip]} { | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After PadToIntrinsicsPass (iree-preprocessing-pad-to-intrinsics) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Top-level module. Its attributes set the default stream affinity to the
// @__device_0 global and carry iree.fixedpoint.iteration = 0.
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
// Executable with a single variant (@rocm_hsaco_fb) targeting
// #executable_target_rocm_hsaco_fb.
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export at ordinal 0 using #pipeline_layout. Its region returns the x/y/z
// workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: reads a 2x32x10x16384 f16 tensor from binding 0, and writes to
// binding 1 an f32 tensor of the same shape in which every (d0, d1) slice has
// had its mean over (d2, d3) subtracted and been scaled by
// rsqrt(variance + %cst), the variance also being taken over (d2, d3).
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Offset used for the binding-0 subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Small constant added to the variance before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840.0 = 10 * 16384, the number of elements reduced per (d0, d1) slice.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant at ordinal 0 of the pipeline layout, widened (unsigned) to index;
// it is used as the offset of the binding-1 subspan.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
// NOTE(review): three assumption entries are attached (two pin the value to
// 43328000, one pins it to 0); presumably one entry per call site - confirm.
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at the constant offset.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
// Binding 1: write-only f32 output at the offset taken from the loaded constant.
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for the full-shape results, %7 for the 2x32 per-slice results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// Elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// Sum of the f32 values over d2 and d3 for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Mean per (d0, d1): the sum divided by 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Sum over d2 and d3 of the squared deviation from the mean, per (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// Per element: (extf(in) - mean) * rsqrt(sum_sq / 163840 + %cst).
// The f16 input %5 is extended again here rather than reusing %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor into the binding-1 output.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Top-level module. Its attributes set the default stream affinity to the
// @__device_0 global and carry iree.fixedpoint.iteration = 0.
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
// Executable with a single variant (@rocm_hsaco_fb) targeting
// #executable_target_rocm_hsaco_fb.
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export at ordinal 0 using #pipeline_layout. Its region returns the x/y/z
// workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: reads a 2x32x10x16384 f16 tensor from binding 0, and writes to
// binding 1 an f32 tensor of the same shape in which every (d0, d1) slice has
// had its mean over (d2, d3) subtracted and been scaled by
// rsqrt(variance + %cst), the variance also being taken over (d2, d3).
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Offset used for the binding-0 subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Small constant added to the variance before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840.0 = 10 * 16384, the number of elements reduced per (d0, d1) slice.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant at ordinal 0 of the pipeline layout, widened (unsigned) to index;
// it is used as the offset of the binding-1 subspan.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
// NOTE(review): three assumption entries are attached (two pin the value to
// 43328000, one pins it to 0); presumably one entry per call site - confirm.
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at the constant offset.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
// Binding 1: write-only f32 output at the offset taken from the loaded constant.
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for the full-shape results, %7 for the 2x32 per-slice results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// Elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// Sum of the f32 values over d2 and d3 for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Mean per (d0, d1): the sum divided by 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Sum over d2 and d3 of the squared deviation from the mean, per (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// Per element: (extf(in) - mean) * rsqrt(sum_sq / 163840 + %cst).
// The f16 input %5 is extended again here rather than reusing %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor into the binding-1 output.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Top-level module. Its only attribute sets the default stream affinity to the
// @__device_0 global. Unlike the dump immediately preceding this one, the
// module no longer carries an iree.fixedpoint.iteration attribute.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
// Executable with a single variant (@rocm_hsaco_fb) targeting
// #executable_target_rocm_hsaco_fb.
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export at ordinal 0 using #pipeline_layout. Its region returns the x/y/z
// workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: reads a 2x32x10x16384 f16 tensor from binding 0, and writes to
// binding 1 an f32 tensor of the same shape in which every (d0, d1) slice has
// had its mean over (d2, d3) subtracted and been scaled by
// rsqrt(variance + %cst), the variance also being taken over (d2, d3).
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Offset used for the binding-0 subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Small constant added to the variance before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840.0 = 10 * 16384, the number of elements reduced per (d0, d1) slice.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant at ordinal 0 of the pipeline layout, widened (unsigned) to index;
// it is used as the offset of the binding-1 subspan.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
// NOTE(review): three assumption entries are attached (two pin the value to
// 43328000, one pins it to 0); presumably one entry per call site - confirm.
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at the constant offset.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
// Binding 1: write-only f32 output at the offset taken from the loaded constant.
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for the full-shape results, %7 for the 2x32 per-slice results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// Elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// Sum of the f32 values over d2 and d3 for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Mean per (d0, d1): the sum divided by 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Sum over d2 and d3 of the squared deviation from the mean, per (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// Per element: (extf(in) - mean) * rsqrt(sum_sq / 163840 + %cst).
// The f16 input %5 is extended again here rather than reusing %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor into the binding-1 output.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Top-level module. Its only attribute sets the default stream affinity to the
// @__device_0 global.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
// Executable with a single variant (@rocm_hsaco_fb) targeting
// #executable_target_rocm_hsaco_fb.
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export at ordinal 0 using #pipeline_layout. Its region returns the x/y/z
// workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: reads a 2x32x10x16384 f16 tensor from binding 0, and writes to
// binding 1 an f32 tensor of the same shape in which every (d0, d1) slice has
// had its mean over (d2, d3) subtracted and been scaled by
// rsqrt(variance + %cst), the variance also being taken over (d2, d3).
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Offset used for the binding-0 subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Small constant added to the variance before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840.0 = 10 * 16384, the number of elements reduced per (d0, d1) slice.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant at ordinal 0 of the pipeline layout, widened (unsigned) to index;
// it is used as the offset of the binding-1 subspan.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
// NOTE(review): three assumption entries are attached (two pin the value to
// 43328000, one pins it to 0); presumably one entry per call site - confirm.
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at the constant offset.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
// Binding 1: write-only f32 output at the offset taken from the loaded constant.
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for the full-shape results, %7 for the 2x32 per-slice results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// Elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// Sum of the f32 values over d2 and d3 for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Mean per (d0, d1): the sum divided by 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Sum over d2 and d3 of the squared deviation from the mean, per (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// Per element: (extf(in) - mean) * rsqrt(sum_sq / 163840 + %cst).
// The f16 input %5 is extended again here rather than reusing %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor into the binding-1 output.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Top-level module. Its only attribute sets the default stream affinity to the
// @__device_0 global.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
// Executable with a single variant (@rocm_hsaco_fb) targeting
// #executable_target_rocm_hsaco_fb.
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export at ordinal 0 using #pipeline_layout. Its region returns the x/y/z
// workgroup counts produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: reads a 2x32x10x16384 f16 tensor from binding 0, and writes to
// binding 1 an f32 tensor of the same shape in which every (d0, d1) slice has
// had its mean over (d2, d3) subtracted and been scaled by
// rsqrt(variance + %cst), the variance also being taken over (d2, d3).
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Offset used for the binding-0 subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Small constant added to the variance before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840.0 = 10 * 16384, the number of elements reduced per (d0, d1) slice.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant at ordinal 0 of the pipeline layout, widened (unsigned) to index;
// it is used as the offset of the binding-1 subspan.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
// NOTE(review): three assumption entries are attached (two pin the value to
// 43328000, one pins it to 0); presumably one entry per call site - confirm.
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at the constant offset.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
// Binding 1: write-only f32 output at the offset taken from the loaded constant.
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for the full-shape results, %7 for the 2x32 per-slice results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// Elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// Sum of the f32 values over d2 and d3 for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Mean per (d0, d1): the sum divided by 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Sum over d2 and d3 of the squared deviation from the mean, per (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// Per element: (extf(in) - mean) * rsqrt(sum_sq / 163840 + %cst).
// The f16 input %5 is extended again here rather than reusing %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor into the binding-1 output.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module whose default stream affinity refers to @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Private global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
// Executable holding one variant (@rocm_hsaco_fb) for #executable_target_rocm_hsaco_fb.
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export at ordinal 0; its region returns the three workgroup counts
// produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. Reads a 2x32x10x16384 f16 tensor and, for every (d0, d1)
// pair, subtracts the mean taken over (d2, d3) and multiplies by
// rsqrt(sum((x - mean)^2) / 163840 + %cst); the f32 result is stored to binding 1.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset used for the input binding (binding 0).
%c22356480 = arith.constant 22356480 : index | |
// Constant added to the scaled sum of squares before math.rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, i.e. the number of (d2, d3) elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize both reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// The output binding offset comes from constant ordinal 0, widened to index
// and annotated with util.assume.int range/divisibility assumptions.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input; binding 1: write-only f32 output.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for full-size results, %7 for per-(d0, d1) results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// Elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// Sum of %8 over the two reduction dimensions (d2, d3).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Mean: the sum divided by %cst_0.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Sum over (d2, d3) of the squared difference between %8 and the mean %11.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// Final elementwise step: (extf(x) - mean) * rsqrt(%12 / %cst_0 + %cst).
// It reads the original f16 input %5 and re-extends it rather than reusing %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full f32 result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module whose default stream affinity refers to @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Private global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
// Executable holding one variant (@rocm_hsaco_fb) for #executable_target_rocm_hsaco_fb.
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export at ordinal 0; its region returns the three workgroup counts
// produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. Reads a 2x32x10x16384 f16 tensor and, for every (d0, d1)
// pair, subtracts the mean taken over (d2, d3) and multiplies by
// rsqrt(sum((x - mean)^2) / 163840 + %cst); the f32 result is stored to binding 1.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset used for the input binding (binding 0).
%c22356480 = arith.constant 22356480 : index | |
// Constant added to the scaled sum of squares before math.rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, i.e. the number of (d2, d3) elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize both reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// The output binding offset comes from constant ordinal 0, widened to index
// and annotated with util.assume.int range/divisibility assumptions.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input; binding 1: write-only f32 output.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for full-size results, %7 for per-(d0, d1) results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// Elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// Sum of %8 over the two reduction dimensions (d2, d3).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Mean: the sum divided by %cst_0.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Sum over (d2, d3) of the squared difference between %8 and the mean %11.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// Final elementwise step: (extf(x) - mean) * rsqrt(%12 / %cst_0 + %cst).
// It reads the original f16 input %5 and re-extends it rather than reusing %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full f32 result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module whose default stream affinity refers to @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Private global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
// Executable holding one variant (@rocm_hsaco_fb) for #executable_target_rocm_hsaco_fb.
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export at ordinal 0; its region returns the three workgroup counts
// produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. Reads a 2x32x10x16384 f16 tensor and, for every (d0, d1)
// pair, subtracts the mean taken over (d2, d3) and multiplies by
// rsqrt(sum((x - mean)^2) / 163840 + %cst); the f32 result is stored to binding 1.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset used for the input binding (binding 0).
%c22356480 = arith.constant 22356480 : index | |
// Constant added to the scaled sum of squares before math.rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, i.e. the number of (d2, d3) elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize both reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// The output binding offset comes from constant ordinal 0, widened to index
// and annotated with util.assume.int range/divisibility assumptions.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input; binding 1: write-only f32 output.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for full-size results, %7 for per-(d0, d1) results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// Elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// Sum of %8 over the two reduction dimensions (d2, d3).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Mean: the sum divided by %cst_0.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Sum over (d2, d3) of the squared difference between %8 and the mean %11.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// Final elementwise step: (extf(x) - mean) * rsqrt(%12 / %cst_0 + %cst).
// It reads the original f16 input %5 and re-extends it rather than reusing %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full f32 result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After DeduplicateExecutablesPass (iree-flow-deduplicate-executables) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module whose default stream affinity refers to @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Private global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
// Executable holding one variant (@rocm_hsaco_fb) for #executable_target_rocm_hsaco_fb.
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export at ordinal 0; its region returns the three workgroup counts
// produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. Reads a 2x32x10x16384 f16 tensor and, for every (d0, d1)
// pair, subtracts the mean taken over (d2, d3) and multiplies by
// rsqrt(sum((x - mean)^2) / 163840 + %cst); the f32 result is stored to binding 1.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset used for the input binding (binding 0).
%c22356480 = arith.constant 22356480 : index | |
// Constant added to the scaled sum of squares before math.rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, i.e. the number of (d2, d3) elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize both reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// The output binding offset comes from constant ordinal 0, widened to index
// and annotated with util.assume.int range/divisibility assumptions.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input; binding 1: write-only f32 output.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for full-size results, %7 for per-(d0, d1) results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// Elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// Sum of %8 over the two reduction dimensions (d2, d3).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Mean: the sum divided by %cst_0.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Sum over (d2, d3) of the squared difference between %8 and the mean %11.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// Final elementwise step: (extf(x) - mean) * rsqrt(%12 / %cst_0 + %cst).
// It reads the original f16 input %5 and re-extends it rather than reusing %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full f32 result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After OutlineConstantsPass (iree-flow-outline-constants) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module whose default stream affinity refers to @__device_0; it also carries
// the attribute iree.fixedpoint.iteration = 0.
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Private global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
// Executable holding one variant (@rocm_hsaco_fb) for #executable_target_rocm_hsaco_fb.
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export at ordinal 0; its region returns the three workgroup counts
// produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. Reads a 2x32x10x16384 f16 tensor and, for every (d0, d1)
// pair, subtracts the mean taken over (d2, d3) and multiplies by
// rsqrt(sum((x - mean)^2) / 163840 + %cst); the f32 result is stored to binding 1.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset used for the input binding (binding 0).
%c22356480 = arith.constant 22356480 : index | |
// Constant added to the scaled sum of squares before math.rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, i.e. the number of (d2, d3) elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize both reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// The output binding offset comes from constant ordinal 0, widened to index
// and annotated with util.assume.int range/divisibility assumptions.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input; binding 1: write-only f32 output.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for full-size results, %7 for per-(d0, d1) results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// Elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// Sum of %8 over the two reduction dimensions (d2, d3).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Mean: the sum divided by %cst_0.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Sum over (d2, d3) of the squared difference between %8 and the mean %11.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// Final elementwise step: (extf(x) - mean) * rsqrt(%12 / %cst_0 + %cst).
// It reads the original f16 input %5 and re-extends it rather than reusing %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full f32 result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module whose default stream affinity refers to @__device_0; it also carries
// the attribute iree.fixedpoint.iteration = 0.
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Private global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
// Executable holding one variant (@rocm_hsaco_fb) for #executable_target_rocm_hsaco_fb.
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export at ordinal 0; its region returns the three workgroup counts
// produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. Reads a 2x32x10x16384 f16 tensor and, for every (d0, d1)
// pair, subtracts the mean taken over (d2, d3) and multiplies by
// rsqrt(sum((x - mean)^2) / 163840 + %cst); the f32 result is stored to binding 1.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset used for the input binding (binding 0).
%c22356480 = arith.constant 22356480 : index | |
// Constant added to the scaled sum of squares before math.rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, i.e. the number of (d2, d3) elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize both reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// The output binding offset comes from constant ordinal 0, widened to index
// and annotated with util.assume.int range/divisibility assumptions.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input; binding 1: write-only f32 output.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for full-size results, %7 for per-(d0, d1) results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// Elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// Sum of %8 over the two reduction dimensions (d2, d3).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Mean: the sum divided by %cst_0.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// Sum over (d2, d3) of the squared difference between %8 and the mean %11.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// Final elementwise step: (extf(x) - mean) * rsqrt(%12 / %cst_0 + %cst).
// It reads the original f16 input %5 and re-extends it rather than reusing %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full f32 result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Top-level module for this dump. Its default stream affinity points at the
// @__device_0 global declared just below; it also carries
// iree.fixedpoint.iteration = 0.
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip attribute alias.
util.global private @__device_0 = #device_target_hip | |
// Executable that holds the single dispatch function of this module.
hal.executable public @main$async_dispatch_14 { | |
// Only variant present; it targets #executable_target_rocm_hsaco_fb.
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export (ordinal 0). Its region returns the x/y/z workgroup counts produced
// by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. For every (d0, d1) pair it reduces over d2 x d3
// (10 x 16384 elements) to obtain a mean and a sum of squared deviations,
// then stores (x - mean) * rsqrt(sum_sq / 163840 + %cst) for each element.
// NOTE(review): this has the shape of a group-norm-style normalization; the
// originating model op is not visible in this dump -- confirm.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset passed to the binding(0) subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Added to the scaled sum of squares right before math.rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, the number of elements reduced per (d0, d1).
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero written by linalg.fill into the accumulator shared by both reductions.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast (unsigned) to index, annotated
// with util.assume.int, and then used as the binding(1) offset.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// binding(0): read-only 2x32x10x16384 f16 input at the constant offset.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
// binding(1): write-only 2x32x10x16384 f32 output at offset %2.
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// tensor.empty results used only as `outs` operands of the ops below.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator, used as `outs` by both reductions.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over the two reduction dims (d2, d3) for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: the sum divided by the element count %cst_0, i.e. the mean.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over (d2, d3) of (%8 - mean)^2 for each (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: per element, (extf(%5) - mean) * rsqrt(%12 / %cst_0 + %cst). It reads
// the f16 input %5 directly and extends it again instead of reusing %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor into the binding(1) output.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Top-level module for this dump. Its default stream affinity points at the
// @__device_0 global declared just below; it also carries
// iree.fixedpoint.iteration = 0.
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip attribute alias.
util.global private @__device_0 = #device_target_hip | |
// Executable that holds the single dispatch function of this module.
hal.executable public @main$async_dispatch_14 { | |
// Only variant present; it targets #executable_target_rocm_hsaco_fb.
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export (ordinal 0). Its region returns the x/y/z workgroup counts produced
// by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. For every (d0, d1) pair it reduces over d2 x d3
// (10 x 16384 elements) to obtain a mean and a sum of squared deviations,
// then stores (x - mean) * rsqrt(sum_sq / 163840 + %cst) for each element.
// NOTE(review): this has the shape of a group-norm-style normalization; the
// originating model op is not visible in this dump -- confirm.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset passed to the binding(0) subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Added to the scaled sum of squares right before math.rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, the number of elements reduced per (d0, d1).
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero written by linalg.fill into the accumulator shared by both reductions.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast (unsigned) to index, annotated
// with util.assume.int, and then used as the binding(1) offset.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// binding(0): read-only 2x32x10x16384 f16 input at the constant offset.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
// binding(1): write-only 2x32x10x16384 f32 output at offset %2.
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// tensor.empty results used only as `outs` operands of the ops below.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator, used as `outs` by both reductions.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over the two reduction dims (d2, d3) for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: the sum divided by the element count %cst_0, i.e. the mean.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over (d2, d3) of (%8 - mean)^2 for each (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: per element, (extf(%5) - mean) * rsqrt(%12 / %cst_0 + %cst). It reads
// the f16 input %5 directly and extends it again instead of reusing %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor into the binding(1) output.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Top-level module for this dump. Its default stream affinity points at the
// @__device_0 global declared just below.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip attribute alias.
util.global private @__device_0 = #device_target_hip | |
// Executable that holds the single dispatch function of this module.
hal.executable public @main$async_dispatch_14 { | |
// Only variant present; it targets #executable_target_rocm_hsaco_fb.
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export (ordinal 0). Its region returns the x/y/z workgroup counts produced
// by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. For every (d0, d1) pair it reduces over d2 x d3
// (10 x 16384 elements) to obtain a mean and a sum of squared deviations,
// then stores (x - mean) * rsqrt(sum_sq / 163840 + %cst) for each element.
// NOTE(review): this has the shape of a group-norm-style normalization; the
// originating model op is not visible in this dump -- confirm.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset passed to the binding(0) subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Added to the scaled sum of squares right before math.rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, the number of elements reduced per (d0, d1).
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero written by linalg.fill into the accumulator shared by both reductions.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast (unsigned) to index, annotated
// with util.assume.int, and then used as the binding(1) offset.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// binding(0): read-only 2x32x10x16384 f16 input at the constant offset.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
// binding(1): write-only 2x32x10x16384 f32 output at offset %2.
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// tensor.empty results used only as `outs` operands of the ops below.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator, used as `outs` by both reductions.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over the two reduction dims (d2, d3) for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: the sum divided by the element count %cst_0, i.e. the mean.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over (d2, d3) of (%8 - mean)^2 for each (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: per element, (extf(%5) - mean) * rsqrt(%12 / %cst_0 + %cst). It reads
// the f16 input %5 directly and extends it again instead of reusing %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor into the binding(1) output.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Top-level module for this dump. Its default stream affinity points at the
// @__device_0 global declared just below.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip attribute alias.
util.global private @__device_0 = #device_target_hip | |
// Executable that holds the single dispatch function of this module.
hal.executable public @main$async_dispatch_14 { | |
// Only variant present; it targets #executable_target_rocm_hsaco_fb.
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export (ordinal 0). Its region returns the x/y/z workgroup counts produced
// by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. For every (d0, d1) pair it reduces over d2 x d3
// (10 x 16384 elements) to obtain a mean and a sum of squared deviations,
// then stores (x - mean) * rsqrt(sum_sq / 163840 + %cst) for each element.
// NOTE(review): this has the shape of a group-norm-style normalization; the
// originating model op is not visible in this dump -- confirm.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset passed to the binding(0) subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Added to the scaled sum of squares right before math.rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, the number of elements reduced per (d0, d1).
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero written by linalg.fill into the accumulator shared by both reductions.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast (unsigned) to index, annotated
// with util.assume.int, and then used as the binding(1) offset.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// binding(0): read-only 2x32x10x16384 f16 input at the constant offset.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
// binding(1): write-only 2x32x10x16384 f32 output at offset %2.
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// tensor.empty results used only as `outs` operands of the ops below.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator, used as `outs` by both reductions.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over the two reduction dims (d2, d3) for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: the sum divided by the element count %cst_0, i.e. the mean.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over (d2, d3) of (%8 - mean)^2 for each (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: per element, (extf(%5) - mean) * rsqrt(%12 / %cst_0 + %cst). It reads
// the f16 input %5 directly and extends it again instead of reusing %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor into the binding(1) output.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After VerifyInputPass (iree-stream-verify-input) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Top-level module for this dump. Its default stream affinity points at the
// @__device_0 global declared just below.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip attribute alias.
util.global private @__device_0 = #device_target_hip | |
// Executable that holds the single dispatch function of this module.
hal.executable public @main$async_dispatch_14 { | |
// Only variant present; it targets #executable_target_rocm_hsaco_fb.
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export (ordinal 0). Its region returns the x/y/z workgroup counts produced
// by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body. For every (d0, d1) pair it reduces over d2 x d3
// (10 x 16384 elements) to obtain a mean and a sum of squared deviations,
// then stores (x - mean) * rsqrt(sum_sq / 163840 + %cst) for each element.
// NOTE(review): this has the shape of a group-norm-style normalization; the
// originating model op is not visible in this dump -- confirm.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset passed to the binding(0) subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Added to the scaled sum of squares right before math.rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, the number of elements reduced per (d0, d1).
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero written by linalg.fill into the accumulator shared by both reductions.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast (unsigned) to index, annotated
// with util.assume.int, and then used as the binding(1) offset.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// binding(0): read-only 2x32x10x16384 f16 input at the constant offset.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
// binding(1): write-only 2x32x10x16384 f32 output at offset %2.
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// tensor.empty results used only as `outs` operands of the ops below.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator, used as `outs` by both reductions.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over the two reduction dims (d2, d3) for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: the sum divided by the element count %cst_0, i.e. the mean.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over (d2, d3) of (%8 - mean)^2 for each (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: per element, (extf(%5) - mean) * rsqrt(%12 / %cst_0 + %cst). It reads
// the f16 input %5 directly and extends it again instead of reusing %8.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor into the binding(1) output.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After CloneToConsumersPass (iree-stream-clone-to-consumers) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After ConvertToStreamPass (iree-stream-conversion) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Dispatch function: normalizes a 2x32x10x16384 f16 input over its last two dimensions.
// For every (d0, d1) pair it computes the mean and the sum of squared deviations over
// (d2, d3), then writes (x - mean) * rsqrt(sum_sq / 163840 + eps) as f32 to the output.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset of the input tensor inside binding 0.
%c22356480 = arith.constant 22356480 : index | |
// Epsilon added to the variance before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// Divisor for both the mean and the variance; 163840 = 10 * 16384, the number of
// elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// The output offset arrives as push constant 0 (i32), is widened to index, and is
// annotated with range/divisibility assumptions (umin/umax/udiv).
// NOTE(review): the meaning of the three assumption entries is not visible here — confirm.
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at the constant offset. Binding 1: write-only f32 output at offset %2.
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for full-shape results, %7 for per-(d0, d1) results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise widening of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator shared by both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of the f32 values over d2 and d3 for each (d0, d1).
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = sum / 163840.
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over d2 and d3 of (x - mean)^2, with the mean broadcast from its (d0, d1) entry.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: final elementwise step. It reads the original f16 input (%5, re-extended here),
// the mean (%11) and the squared-deviation sum (%12).
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
// variance = sum_sq / 163840; scale = rsqrt(variance + eps).
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
// result = (x - mean) * scale.
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full normalized tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After Inliner (inline) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module state as dumped after the Inliner pass.
// The default stream affinity is bound to the device global @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
// Executable for dispatch 14, with a single variant for the #executable_target_rocm_hsaco_fb target.
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export ordinal 0. Its region returns the workgroup count (x, y, z), which at this
// stage is still produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch function: normalizes a 2x32x10x16384 f16 input over its last two dimensions.
// For every (d0, d1) pair it computes the mean and the sum of squared deviations over
// (d2, d3), then writes (x - mean) * rsqrt(sum_sq / 163840 + eps) as f32 to the output.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset of the input tensor inside binding 0.
%c22356480 = arith.constant 22356480 : index | |
// Epsilon added to the variance before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// Divisor for both the mean and the variance; 163840 = 10 * 16384, the number of
// elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// The output offset arrives as push constant 0 (i32), is widened to index, and is
// annotated with range/divisibility assumptions (umin/umax/udiv).
// NOTE(review): the meaning of the three assumption entries is not visible here — confirm.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at the constant offset. Binding 1: write-only f32 output at offset %2.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for full-shape results, %7 for per-(d0, d1) results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise widening of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator shared by both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of the f32 values over d2 and d3 for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = sum / 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over d2 and d3 of (x - mean)^2, with the mean broadcast from its (d0, d1) entry.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: final elementwise step. It reads the original f16 input (%5, re-extended here),
// the mean (%11) and the squared-deviation sum (%12).
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
// variance = sum_sq / 163840; scale = rsqrt(variance + eps).
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
// result = (x - mean) * scale.
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full normalized tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module state as dumped after FoldGlobalsPass (iree-util-fold-globals).
// The default stream affinity is bound to the device global @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
// Executable for dispatch 14, with a single variant for the #executable_target_rocm_hsaco_fb target.
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export ordinal 0. Its region returns the workgroup count (x, y, z), which at this
// stage is still produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch function: normalizes a 2x32x10x16384 f16 input over its last two dimensions.
// For every (d0, d1) pair it computes the mean and the sum of squared deviations over
// (d2, d3), then writes (x - mean) * rsqrt(sum_sq / 163840 + eps) as f32 to the output.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset of the input tensor inside binding 0.
%c22356480 = arith.constant 22356480 : index | |
// Epsilon added to the variance before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// Divisor for both the mean and the variance; 163840 = 10 * 16384, the number of
// elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// The output offset arrives as push constant 0 (i32), is widened to index, and is
// annotated with range/divisibility assumptions (umin/umax/udiv).
// NOTE(review): the meaning of the three assumption entries is not visible here — confirm.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at the constant offset. Binding 1: write-only f32 output at offset %2.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for full-shape results, %7 for per-(d0, d1) results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise widening of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator shared by both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of the f32 values over d2 and d3 for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = sum / 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over d2 and d3 of (x - mean)^2, with the mean broadcast from its (d0, d1) entry.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: final elementwise step. It reads the original f16 input (%5, re-extended here),
// the mean (%11) and the squared-deviation sum (%12).
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
// variance = sum_sq / 163840; scale = rsqrt(variance + eps).
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
// result = (x - mean) * scale.
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full normalized tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module state as dumped after FuseGlobalsPass (iree-util-fuse-globals).
// The default stream affinity is bound to the device global @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
// Executable for dispatch 14, with a single variant for the #executable_target_rocm_hsaco_fb target.
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export ordinal 0. Its region returns the workgroup count (x, y, z), which at this
// stage is still produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch function: normalizes a 2x32x10x16384 f16 input over its last two dimensions.
// For every (d0, d1) pair it computes the mean and the sum of squared deviations over
// (d2, d3), then writes (x - mean) * rsqrt(sum_sq / 163840 + eps) as f32 to the output.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset of the input tensor inside binding 0.
%c22356480 = arith.constant 22356480 : index | |
// Epsilon added to the variance before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// Divisor for both the mean and the variance; 163840 = 10 * 16384, the number of
// elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// The output offset arrives as push constant 0 (i32), is widened to index, and is
// annotated with range/divisibility assumptions (umin/umax/udiv).
// NOTE(review): the meaning of the three assumption entries is not visible here — confirm.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at the constant offset. Binding 1: write-only f32 output at offset %2.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for full-shape results, %7 for per-(d0, d1) results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise widening of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator shared by both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of the f32 values over d2 and d3 for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = sum / 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over d2 and d3 of (x - mean)^2, with the mean broadcast from its (d0, d1) entry.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: final elementwise step. It reads the original f16 input (%5, re-extended here),
// the mean (%11) and the squared-deviation sum (%12).
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
// variance = sum_sq / 163840; scale = rsqrt(variance + eps).
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
// result = (x - mean) * scale.
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full normalized tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module state as dumped after IPOPass (iree-util-ipo).
// The default stream affinity is bound to the device global @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
// Executable for dispatch 14, with a single variant for the #executable_target_rocm_hsaco_fb target.
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export ordinal 0. Its region returns the workgroup count (x, y, z), which at this
// stage is still produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch function: normalizes a 2x32x10x16384 f16 input over its last two dimensions.
// For every (d0, d1) pair it computes the mean and the sum of squared deviations over
// (d2, d3), then writes (x - mean) * rsqrt(sum_sq / 163840 + eps) as f32 to the output.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset of the input tensor inside binding 0.
%c22356480 = arith.constant 22356480 : index | |
// Epsilon added to the variance before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// Divisor for both the mean and the variance; 163840 = 10 * 16384, the number of
// elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// The output offset arrives as push constant 0 (i32), is widened to index, and is
// annotated with range/divisibility assumptions (umin/umax/udiv).
// NOTE(review): the meaning of the three assumption entries is not visible here — confirm.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at the constant offset. Binding 1: write-only f32 output at offset %2.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for full-shape results, %7 for per-(d0, d1) results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise widening of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator shared by both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of the f32 values over d2 and d3 for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = sum / 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over d2 and d3 of (x - mean)^2, with the mean broadcast from its (d0, d1) entry.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: final elementwise step. It reads the original f16 input (%5, re-extended here),
// the mean (%11) and the squared-deviation sum (%12).
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
// variance = sum_sq / 163840; scale = rsqrt(variance + eps).
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
// result = (x - mean) * scale.
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full normalized tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After CombineInitializersPass (iree-util-combine-initializers) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module state as dumped after CombineInitializersPass (iree-util-combine-initializers).
// The default stream affinity is bound to the device global @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
// Executable for dispatch 14, with a single variant for the #executable_target_rocm_hsaco_fb target.
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export ordinal 0. Its region returns the workgroup count (x, y, z), which at this
// stage is still produced by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch function: normalizes a 2x32x10x16384 f16 input over its last two dimensions.
// For every (d0, d1) pair it computes the mean and the sum of squared deviations over
// (d2, d3), then writes (x - mean) * rsqrt(sum_sq / 163840 + eps) as f32 to the output.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset of the input tensor inside binding 0.
%c22356480 = arith.constant 22356480 : index | |
// Epsilon added to the variance before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// Divisor for both the mean and the variance; 163840 = 10 * 16384, the number of
// elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// The output offset arrives as push constant 0 (i32), is widened to index, and is
// annotated with range/divisibility assumptions (umin/umax/udiv).
// NOTE(review): the meaning of the three assumption entries is not visible here — confirm.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at the constant offset. Binding 1: write-only f32 output at offset %2.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for full-shape results, %7 for per-(d0, d1) results.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise widening of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator shared by both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of the f32 values over d2 and d3 for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = sum / 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over d2 and d3 of (x - mean)^2, with the mean broadcast from its (d0, d1) entry.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: final elementwise step. It reads the original f16 input (%5, re-extended here),
// the mean (%11) and the squared-deviation sum (%12).
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
// variance = sum_sq / 163840; scale = rsqrt(variance + eps).
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
// result = (x - mean) * scale.
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Write the full normalized tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After SpecializeEncodingsPass (iree-stream-specialize-encodings) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module snapshot containing a single HAL executable, @main$async_dispatch_14.
// The default stream affinity of the module is the device global @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
// Single variant, targeting the #executable_target_rocm_hsaco_fb alias.
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export ordinal 0. Its region returns the (x, y, z) workgroup count produced by
// flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: reads a 2x32x10x16384 f16 tensor from binding 0 and writes a
// 2x32x10x16384 f32 tensor to binding 1. For every (d0, d1) pair it computes the
// mean and the sum of squared deviations over (d2, d3), then emits
// (x - mean) * rsqrt(sum_sq / 163840 + %cst) for each element.
// NOTE(review): looks like a group/layer-norm style normalization without
// scale or bias - confirm against the model source.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Offset used for the binding(0) subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Small constant added to the variance term before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, the number of elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast (unsigned) to index, annotated with
// umin/umax/udiv assumptions, and then used as the offset of the output binding.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// %3: read-only f16 input view (binding 0); %4: write-only f32 output view (binding 1).
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for the full-size f32 results, %7 for the 2x32 statistics.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over the two reduction dims (d2, d3) for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = sum / 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over (d2, d3) of (x - mean)^2 for each (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: per-element normalization. Reads the f16 input %5 directly (re-extending it
// in the body) together with the mean %11 and the squared-deviation sum %12.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
// sum_sq / 163840, plus %cst, then reciprocal square root.
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
// (x - mean) scaled by the reciprocal square root above.
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the whole result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After RefineUsagePass (iree-stream-refine-usage) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module snapshot containing a single HAL executable, @main$async_dispatch_14.
// The default stream affinity of the module is the device global @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
// Single variant, targeting the #executable_target_rocm_hsaco_fb alias.
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export ordinal 0. Its region returns the (x, y, z) workgroup count produced by
// flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: reads a 2x32x10x16384 f16 tensor from binding 0 and writes a
// 2x32x10x16384 f32 tensor to binding 1. For every (d0, d1) pair it computes the
// mean and the sum of squared deviations over (d2, d3), then emits
// (x - mean) * rsqrt(sum_sq / 163840 + %cst) for each element.
// NOTE(review): looks like a group/layer-norm style normalization without
// scale or bias - confirm against the model source.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Offset used for the binding(0) subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Small constant added to the variance term before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, the number of elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast (unsigned) to index, annotated with
// umin/umax/udiv assumptions, and then used as the offset of the output binding.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// %3: read-only f16 input view (binding 0); %4: write-only f32 output view (binding 1).
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for the full-size f32 results, %7 for the 2x32 statistics.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over the two reduction dims (d2, d3) for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = sum / 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over (d2, d3) of (x - mean)^2 for each (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: per-element normalization. Reads the f16 input %5 directly (re-extending it
// in the body) together with the mean %11 and the squared-deviation sum %12.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
// sum_sq / 163840, plus %cst, then reciprocal square root.
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
// (x - mean) scaled by the reciprocal square root above.
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the whole result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module snapshot containing a single HAL executable, @main$async_dispatch_14.
// The default stream affinity of the module is the device global @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
// Single variant, targeting the #executable_target_rocm_hsaco_fb alias.
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export ordinal 0. Its region returns the (x, y, z) workgroup count produced by
// flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: reads a 2x32x10x16384 f16 tensor from binding 0 and writes a
// 2x32x10x16384 f32 tensor to binding 1. For every (d0, d1) pair it computes the
// mean and the sum of squared deviations over (d2, d3), then emits
// (x - mean) * rsqrt(sum_sq / 163840 + %cst) for each element.
// NOTE(review): looks like a group/layer-norm style normalization without
// scale or bias - confirm against the model source.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Offset used for the binding(0) subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Small constant added to the variance term before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, the number of elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast (unsigned) to index, annotated with
// umin/umax/udiv assumptions, and then used as the offset of the output binding.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// %3: read-only f16 input view (binding 0); %4: write-only f32 output view (binding 1).
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for the full-size f32 results, %7 for the 2x32 statistics.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over the two reduction dims (d2, d3) for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = sum / 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over (d2, d3) of (x - mean)^2 for each (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: per-element normalization. Reads the f16 input %5 directly (re-extending it
// in the body) together with the mean %11 and the squared-deviation sum %12.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
// sum_sq / 163840, plus %cst, then reciprocal square root.
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
// (x - mean) scaled by the reciprocal square root above.
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the whole result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module snapshot containing a single HAL executable, @main$async_dispatch_14.
// The default stream affinity of the module is the device global @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
// Single variant, targeting the #executable_target_rocm_hsaco_fb alias.
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export ordinal 0. Its region returns the (x, y, z) workgroup count produced by
// flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: reads a 2x32x10x16384 f16 tensor from binding 0 and writes a
// 2x32x10x16384 f32 tensor to binding 1. For every (d0, d1) pair it computes the
// mean and the sum of squared deviations over (d2, d3), then emits
// (x - mean) * rsqrt(sum_sq / 163840 + %cst) for each element.
// NOTE(review): looks like a group/layer-norm style normalization without
// scale or bias - confirm against the model source.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Offset used for the binding(0) subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Small constant added to the variance term before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, the number of elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast (unsigned) to index, annotated with
// umin/umax/udiv assumptions, and then used as the offset of the output binding.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// %3: read-only f16 input view (binding 0); %4: write-only f32 output view (binding 1).
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for the full-size f32 results, %7 for the 2x32 statistics.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over the two reduction dims (d2, d3) for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = sum / 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over (d2, d3) of (x - mean)^2 for each (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: per-element normalization. Reads the f16 input %5 directly (re-extending it
// in the body) together with the mean %11 and the squared-deviation sum %12.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
// sum_sq / 163840, plus %cst, then reciprocal square root.
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
// (x - mean) scaled by the reciprocal square root above.
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the whole result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module snapshot containing a single HAL executable, @main$async_dispatch_14.
// The default stream affinity of the module is the device global @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
// Single variant, targeting the #executable_target_rocm_hsaco_fb alias.
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export ordinal 0. Its region returns the (x, y, z) workgroup count produced by
// flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: reads a 2x32x10x16384 f16 tensor from binding 0 and writes a
// 2x32x10x16384 f32 tensor to binding 1. For every (d0, d1) pair it computes the
// mean and the sum of squared deviations over (d2, d3), then emits
// (x - mean) * rsqrt(sum_sq / 163840 + %cst) for each element.
// NOTE(review): looks like a group/layer-norm style normalization without
// scale or bias - confirm against the model source.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Offset used for the binding(0) subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Small constant added to the variance term before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, the number of elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast (unsigned) to index, annotated with
// umin/umax/udiv assumptions, and then used as the offset of the output binding.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// %3: read-only f16 input view (binding 0); %4: write-only f32 output view (binding 1).
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for the full-size f32 results, %7 for the 2x32 statistics.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over the two reduction dims (d2, d3) for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = sum / 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over (d2, d3) of (x - mean)^2 for each (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: per-element normalization. Reads the f16 input %5 directly (re-extending it
// in the body) together with the mean %11 and the squared-deviation sum %12.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
// sum_sq / 163840, plus %cst, then reciprocal square root.
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
// (x - mean) scaled by the reciprocal square root above.
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the whole result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module snapshot containing a single HAL executable, @main$async_dispatch_14.
// The default stream affinity of the module is the device global @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
// Single variant, targeting the #executable_target_rocm_hsaco_fb alias.
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export ordinal 0. Its region returns the (x, y, z) workgroup count produced by
// flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Dispatch body: reads a 2x32x10x16384 f16 tensor from binding 0 and writes a
// 2x32x10x16384 f32 tensor to binding 1. For every (d0, d1) pair it computes the
// mean and the sum of squared deviations over (d2, d3), then emits
// (x - mean) * rsqrt(sum_sq / 163840 + %cst) for each element.
// NOTE(review): looks like a group/layer-norm style normalization without
// scale or bias - confirm against the model source.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Offset used for the binding(0) subspan below.
%c22356480 = arith.constant 22356480 : index | |
// Small constant added to the variance term before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// 163840 = 10 * 16384, the number of elements reduced per (d0, d1) pair.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast (unsigned) to index, annotated with
// umin/umax/udiv assumptions, and then used as the offset of the output binding.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// %3: read-only f16 input view (binding 0); %4: write-only f32 output view (binding 1).
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors: %6 for the full-size f32 results, %7 for the 2x32 statistics.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator, used as the init of both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over the two reduction dims (d2, d3) for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: mean = sum / 163840.
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over (d2, d3) of (x - mean)^2 for each (d0, d1).
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: per-element normalization. Reads the f16 input %5 directly (re-extending it
// in the body) together with the mean %11 and the squared-deviation sum %12.
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
// sum_sq / 163840, plus %cst, then reciprocal square root.
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
// (x - mean) scaled by the reciprocal square root above.
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the whole result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After SyncInitializersPass (iree-stream-sync-initializers) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module containing one HAL executable (@main$async_dispatch_14) with a single
// rocm_hsaco_fb variant; the default stream affinity points at @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export (ordinal 0): its region returns the three workgroup counts produced
// by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Kernel body. Reads a 2x32x10x16384 f16 tensor and writes a same-shaped f32
// tensor computed per element as
//   (extf(x) - mean) * rsqrt(sumsq / %cst_0 + %cst)
// where, for each (d0, d1), mean = sum(extf(x)) / %cst_0 and
// sumsq = sum((extf(x) - mean)^2), both reduced over d2 and d3.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset used for the input subspan on binding 0.
%c22356480 = arith.constant 22356480 : index | |
// Small constant added to the scaled sum of squares before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// Divisor for both reductions; 163840 = 10 * 16384, the number of elements
// reduced per (d0, d1).
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// The output offset arrives as push constant 0 (i32) and is cast, unsigned, to index.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
// Attaches the listed umin/umax/udiv assumptions to the offset.
// NOTE(review): the third entry (0) differs from the first two (43328000);
// presumably one entry per dispatch site -- confirm against the caller.
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at the constant offset.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
// Binding 1: write-only f32 output at the dynamic offset %2.
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf32> | |
// Destination tensors used as outs operands: full-size f32 and the 2x32 reduction shape.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator shared by both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over the two reduction dimensions (d2, d3) for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: the sum divided by %cst_0, i.e. the mean per (d0, d1).
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over (d2, d3) of the squared difference between %8 and the mean %11.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: final elementwise step. Re-extends the f16 input (%5, not %8), subtracts
// the mean (%in_2) and multiplies by rsqrt(%in_3 / %cst_0 + %cst).
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module containing one HAL executable (@main$async_dispatch_14) with a single
// rocm_hsaco_fb variant; the default stream affinity points at @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export (ordinal 0): its region returns the three workgroup counts produced
// by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Kernel body. Reads a 2x32x10x16384 f16 tensor and writes a same-shaped f32
// tensor computed per element as
//   (extf(x) - mean) * rsqrt(sumsq / %cst_0 + %cst)
// where, for each (d0, d1), mean = sum(extf(x)) / %cst_0 and
// sumsq = sum((extf(x) - mean)^2), both reduced over d2 and d3.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset used for the input subspan on binding 0.
%c22356480 = arith.constant 22356480 : index | |
// Small constant added to the scaled sum of squares before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// Divisor for both reductions; 163840 = 10 * 16384, the number of elements
// reduced per (d0, d1).
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// The output offset arrives as push constant 0 (i32) and is cast, unsigned, to index.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
// Attaches the listed umin/umax/udiv assumptions to the offset.
// NOTE(review): the third entry (0) differs from the first two (43328000);
// presumably one entry per dispatch site -- confirm against the caller.
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at the constant offset.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
// Binding 1: write-only f32 output at the dynamic offset %2.
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors used as outs operands: full-size f32 and the 2x32 reduction shape.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator shared by both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over the two reduction dimensions (d2, d3) for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: the sum divided by %cst_0, i.e. the mean per (d0, d1).
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over (d2, d3) of the squared difference between %8 and the mean %11.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: final elementwise step. Re-extends the f16 input (%5, not %8), subtracts
// the mean (%in_2) and multiplies by rsqrt(%in_3 / %cst_0 + %cst).
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After MaterializeBuiltinsPass (iree-stream-materialize-builtins) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module containing one HAL executable (@main$async_dispatch_14) with a single
// rocm_hsaco_fb variant; the default stream affinity points at @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export (ordinal 0): its region returns the three workgroup counts produced
// by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Kernel body. Reads a 2x32x10x16384 f16 tensor and writes a same-shaped f32
// tensor computed per element as
//   (extf(x) - mean) * rsqrt(sumsq / %cst_0 + %cst)
// where, for each (d0, d1), mean = sum(extf(x)) / %cst_0 and
// sumsq = sum((extf(x) - mean)^2), both reduced over d2 and d3.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset used for the input subspan on binding 0.
%c22356480 = arith.constant 22356480 : index | |
// Small constant added to the scaled sum of squares before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// Divisor for both reductions; 163840 = 10 * 16384, the number of elements
// reduced per (d0, d1).
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// The output offset arrives as push constant 0 (i32) and is cast, unsigned, to index.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
// Attaches the listed umin/umax/udiv assumptions to the offset.
// NOTE(review): the third entry (0) differs from the first two (43328000);
// presumably one entry per dispatch site -- confirm against the caller.
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at the constant offset.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
// Binding 1: write-only f32 output at the dynamic offset %2.
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors used as outs operands: full-size f32 and the 2x32 reduction shape.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator shared by both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over the two reduction dimensions (d2, d3) for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: the sum divided by %cst_0, i.e. the mean per (d0, d1).
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over (d2, d3) of the squared difference between %8 and the mean %11.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: final elementwise step. Re-extends the f16 input (%5, not %8), subtracts
// the mean (%in_2) and multiplies by rsqrt(%in_3 / %cst_0 + %cst).
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module containing one HAL executable (@main$async_dispatch_14) with a single
// rocm_hsaco_fb variant; the default stream affinity points at @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export (ordinal 0): its region returns the three workgroup counts produced
// by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Kernel body. Reads a 2x32x10x16384 f16 tensor and writes a same-shaped f32
// tensor computed per element as
//   (extf(x) - mean) * rsqrt(sumsq / %cst_0 + %cst)
// where, for each (d0, d1), mean = sum(extf(x)) / %cst_0 and
// sumsq = sum((extf(x) - mean)^2), both reduced over d2 and d3.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset used for the input subspan on binding 0.
%c22356480 = arith.constant 22356480 : index | |
// Small constant added to the scaled sum of squares before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// Divisor for both reductions; 163840 = 10 * 16384, the number of elements
// reduced per (d0, d1).
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// The output offset arrives as push constant 0 (i32) and is cast, unsigned, to index.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
// Attaches the listed umin/umax/udiv assumptions to the offset.
// NOTE(review): the third entry (0) differs from the first two (43328000);
// presumably one entry per dispatch site -- confirm against the caller.
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at the constant offset.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
// Binding 1: write-only f32 output at the dynamic offset %2.
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors used as outs operands: full-size f32 and the 2x32 reduction shape.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator shared by both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over the two reduction dimensions (d2, d3) for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: the sum divided by %cst_0, i.e. the mean per (d0, d1).
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over (d2, d3) of the squared difference between %8 and the mean %11.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: final elementwise step. Re-extends the f16 input (%5, not %8), subtracts
// the mean (%in_2) and multiplies by rsqrt(%in_3 / %cst_0 + %cst).
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
// Module containing one HAL executable (@main$async_dispatch_14) with a single
// rocm_hsaco_fb variant; the default stream affinity points at @__device_0.
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
// Device global initialized from the #device_target_hip alias.
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
// Export (ordinal 0): its region returns the three workgroup counts produced
// by flow.dispatch.workgroup_count_from_slice.
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
// Kernel body. Reads a 2x32x10x16384 f16 tensor and writes a same-shaped f32
// tensor computed per element as
//   (extf(x) - mean) * rsqrt(sumsq / %cst_0 + %cst)
// where, for each (d0, d1), mean = sum(extf(x)) / %cst_0 and
// sumsq = sum((extf(x) - mean)^2), both reduced over d2 and d3.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
// Constant offset used for the input subspan on binding 0.
%c22356480 = arith.constant 22356480 : index | |
// Small constant added to the scaled sum of squares before rsqrt.
%cst = arith.constant 9.99999974E-6 : f32 | |
// Divisor for both reductions; 163840 = 10 * 16384, the number of elements
// reduced per (d0, d1).
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// Zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// The output offset arrives as push constant 0 (i32) and is cast, unsigned, to index.
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
// Attaches the listed umin/umax/udiv assumptions to the offset.
// NOTE(review): the third entry (0) differs from the first two (43328000);
// presumably one entry per dispatch site -- confirm against the caller.
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at the constant offset.
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
// Binding 1: write-only f32 output at the dynamic offset %2.
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
// Load the whole input tensor (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
// Destination tensors used as outs operands: full-size f32 and the 2x32 reduction shape.
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
// %8: elementwise extension of the input from f16 to f32.
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// %9: zero-filled 2x32 accumulator shared by both reductions below.
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
// %10: sum of %8 over the two reduction dimensions (d2, d3) for each (d0, d1).
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %11: the sum divided by %cst_0, i.e. the mean per (d0, d1).
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
// %12: sum over (d2, d3) of the squared difference between %8 and the mean %11.
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
// %13: final elementwise step. Re-extends the f16 input (%5, not %8), subtracts
// the mean (%in_2) and multiplies by rsqrt(%in_3 / %cst_0 + %cst).
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
// Store the full result tensor to the output binding.
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToAsyncPass (iree-stream-verify-lowering-to-async) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After ScheduleAllocationPass (iree-stream-schedule-allocation) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After PropagateSubrangesPass (iree-util-propagate-subranges) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToCmdPass (iree-stream-verify-lowering-to-cmd) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After ElideTimepointsPass (iree-stream-elide-timepoints) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FuseDispatchBindingsPass (iree-stream-fuse-dispatch-bindings) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchArgumentsPass (iree-stream-annotate-dispatch-arguments) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchAssumptionsPass (iree-stream-annotate-dispatch-assumptions) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After PackDispatchOperandsPass (iree-stream-pack-dispatch-operands) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FoldUniformOperandsPass (iree-stream-fold-uniform-operands) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After DumpExecutableSourcesPass (iree-hal-dump-executable-sources) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After MaterializeDeviceEncodingPass (iree-codegen-materialize-device-encoding) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After MaterializeEncodingIntoPaddingPass (iree-codegen-materialize-encoding-into-padding) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After GPUGeneralizeNamedOpsPass (iree-codegen-gpu-generalize-named-ops) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After BlockDynamicDimensionsPass (iree-codegen-block-dynamic-dimensions) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After MaterializeTuningSpecsPass (iree-codegen-materialize-tuning-specs) //----- // | |
module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- // | |
module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
The parallel size is 16384 | |
The lastDimWgpTileSize is 8192 | |
The lastDimWgpTileSize is 8192 | |
The parallel size is 16384 | |
The lastDimWgpTileSize is 8192 | |
The lastDimWgpTileSize is 8192 | |
This op is going through 0 | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
The entry point is ............. | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After LLVMGPUSelectLoweringStrategyPass (iree-llvmgpu-select-lowering-strategy) //----- // | |
module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- // | |
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- // | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
// -----// IR Dump After DumpExecutableSourcesPass (iree-hal-dump-executable-sources) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #translation} { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
compiled_punet_main$async_dispatch_14.mlir:0:0: remark: Executable benchmarks were requested but none were generated. Run with --debug-only=iree-dump-executable-benchmarks for more details. | |
// -----// IR Dump After DumpExecutableBenchmarksPass (iree-hal-dump-executable-benchmarks) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @main$async_dispatch_14 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #translation} { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After HoistExecutableObjectsPass (iree-hal-hoist-executable-objects) //----- // | |
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>) { | |
hal.executable.export public @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32 ordinal(0) layout(#hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- // | |
module { | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%14 = arith.extf %in : f16 to f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.addf %in, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%14 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%8, %11 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%9 : tensor<2x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%14 = arith.subf %in, %in_2 : f32 | |
%15 = arith.mulf %14, %14 : f32 | |
%16 = arith.addf %15, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %11, %12 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%6 : tensor<2x32x10x16384xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%14 = arith.divf %in_3, %cst_0 : f32 | |
%15 = arith.addf %14, %cst : f32 | |
%16 = math.rsqrt %15 : f32 | |
%17 = arith.extf %in : f16 to f32 | |
%18 = arith.subf %17, %in_2 : f32 | |
%19 = arith.mulf %18, %16 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
%extracted_slice_2 = tensor.extract_slice %5[%arg0, %arg1, 0, 0] [1, 1, 10, 16384] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x16384xf16> | |
%8 = tensor.empty() : tensor<1x1x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2 : tensor<1x1x10x16384xf16>) outs(%8 : tensor<1x1x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%28 = arith.extf %in : f16 to f32 | |
linalg.yield %28 : f32 | |
} -> tensor<1x1x10x16384xf32> | |
%10 = tensor.empty() : tensor<1x1xf32> | |
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<1x1x10x16384xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.addf %in, %out : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<1x1xf32> | |
%13 = tensor.empty() : tensor<1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<1x1xf32>) outs(%13 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<1x1xf32> | |
%extracted_slice_3 = tensor.extract_slice %5[%arg0, %arg1, 0, 0] [1, 1, 10, 16384] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x16384xf16> | |
%15 = tensor.empty() : tensor<1x1x10x16384xf32> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_3 : tensor<1x1x10x16384xf16>) outs(%15 : tensor<1x1x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%28 = arith.extf %in : f16 to f32 | |
linalg.yield %28 : f32 | |
} -> tensor<1x1x10x16384xf32> | |
%extracted_slice_4 = tensor.extract_slice %5[%arg0, %arg1, 0, 0] [1, 1, 10, 16384] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x16384xf16> | |
%17 = tensor.empty() : tensor<1x1x10x16384xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_4 : tensor<1x1x10x16384xf16>) outs(%17 : tensor<1x1x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%28 = arith.extf %in : f16 to f32 | |
linalg.yield %28 : f32 | |
} -> tensor<1x1x10x16384xf32> | |
%19 = tensor.empty() : tensor<1x1xf32> | |
%20 = linalg.fill ins(%cst_1 : f32) outs(%19 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%18 : tensor<1x1x10x16384xf32>) outs(%20 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.addf %in, %out : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<1x1xf32> | |
%22 = tensor.empty() : tensor<1x1xf32> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%21 : tensor<1x1xf32>) outs(%22 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<1x1xf32> | |
%24 = tensor.empty() : tensor<1x1xf32> | |
%25 = linalg.fill ins(%cst_1 : f32) outs(%24 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%16, %23 : tensor<1x1x10x16384xf32>, tensor<1x1xf32>) outs(%25 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_6: f32, %out: f32): | |
%28 = arith.subf %in, %in_6 : f32 | |
%29 = arith.mulf %28, %28 : f32 | |
%30 = arith.addf %29, %out : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<1x1xf32> | |
%extracted_slice_5 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %14, %26 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_5 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_6: f32, %in_7: f32, %out: f32): | |
%28 = arith.divf %in_7, %cst_0 : f32 | |
%29 = arith.addf %28, %cst : f32 | |
%30 = math.rsqrt %29 : f32 | |
%31 = arith.extf %in : f16 to f32 | |
%32 = arith.subf %31, %in_6 : f32 | |
%33 = arith.mulf %32, %30 : f32 | |
linalg.yield %33 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %27 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
// Argument-less dispatch function: loads a 2x32x10x16384 f16 tensor from binding(0), normalizes every
// [i, j, :, :] slice in f32 (subtract the slice mean, scale by rsqrt(sum of squared deviations / 163840 + %cst)),
// and stores the 2x32x10x16384 f32 result to binding(1).
// In this snapshot the f16->f32 extension and the sum/divide chain still appear more than once.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
// %c22356480: offset operand of the input binding.
%c22356480 = arith.constant 22356480 : index | |
// %cst: small constant added before math.rsqrt in the final elementwise op.
%cst = arith.constant 9.99999974E-6 : f32 | |
// %cst_0: 163840 = 10 * 16384, the element count of one 1x1x10x16384 slice; used as the divisor below.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// %cst_1: zero used to initialize the reduction accumulators.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast to index and passed through util.assume.int; the result %2 is the offset of the output binding.
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// binding(0): read-only f16 input at offset %c22356480. binding(1): write-only f32 output at offset %2.
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
// Loop over (2, 32, 16384) with step (1, 1, 8192): 2 * 32 * 2 iterations, each producing one 1x1x10x8192 output tile.
// The mapping attribute on the closing brace assigns the three induction variables to workgroup z, y, x.
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
// Input tile matching the output tile of this iteration (offset %arg2 along the last dimension).
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
// Full 1x1x10x16384 slice for (%arg0, %arg1). It does not depend on %arg2, so both iterations along the last
// dimension reduce over the same data.
%extracted_slice_2 = tensor.extract_slice %5[%arg0, %arg1, 0, 0] [1, 1, 10, 16384] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x16384xf16> | |
%8 = tensor.empty() : tensor<1x1x10x16384xf32> | |
// %9: elementwise f16 -> f32 extension of the full slice.
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2 : tensor<1x1x10x16384xf16>) outs(%8 : tensor<1x1x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%28 = arith.extf %in : f16 to f32 | |
linalg.yield %28 : f32 | |
} -> tensor<1x1x10x16384xf32> | |
%10 = tensor.empty() : tensor<1x1xf32> | |
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
// %12: sum of %9 over d2 and d3 (the two reduction iterators), accumulated from the zero-filled %11.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<1x1x10x16384xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.addf %in, %out : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<1x1xf32> | |
%13 = tensor.empty() : tensor<1x1xf32> | |
// %14: %12 / 163840, i.e. the mean of the slice. This copy is the one subtracted in the final elementwise op (%27).
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<1x1xf32>) outs(%13 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<1x1xf32> | |
// %extracted_slice_3 / %16: same slice and same f16 -> f32 extension as %extracted_slice_2 / %9; %16 feeds the squared-deviation reduction (%26).
%extracted_slice_3 = tensor.extract_slice %5[%arg0, %arg1, 0, 0] [1, 1, 10, 16384] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x16384xf16> | |
%15 = tensor.empty() : tensor<1x1x10x16384xf32> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_3 : tensor<1x1x10x16384xf16>) outs(%15 : tensor<1x1x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%28 = arith.extf %in : f16 to f32 | |
linalg.yield %28 : f32 | |
} -> tensor<1x1x10x16384xf32> | |
// %extracted_slice_4 / %18 / %21 / %23: a further copy of the extension, sum and divide-by-163840 chain; %23 is the mean used inside %26.
%extracted_slice_4 = tensor.extract_slice %5[%arg0, %arg1, 0, 0] [1, 1, 10, 16384] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x16384xf16> | |
%17 = tensor.empty() : tensor<1x1x10x16384xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_4 : tensor<1x1x10x16384xf16>) outs(%17 : tensor<1x1x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%28 = arith.extf %in : f16 to f32 | |
linalg.yield %28 : f32 | |
} -> tensor<1x1x10x16384xf32> | |
%19 = tensor.empty() : tensor<1x1xf32> | |
%20 = linalg.fill ins(%cst_1 : f32) outs(%19 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%18 : tensor<1x1x10x16384xf32>) outs(%20 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.addf %in, %out : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<1x1xf32> | |
%22 = tensor.empty() : tensor<1x1xf32> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%21 : tensor<1x1xf32>) outs(%22 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<1x1xf32> | |
%24 = tensor.empty() : tensor<1x1xf32> | |
%25 = linalg.fill ins(%cst_1 : f32) outs(%24 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
// %26: sum over d2 and d3 of (x - mean)^2, with x from %16 and the mean from %23, accumulated from the zero-filled %25.
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%16, %23 : tensor<1x1x10x16384xf32>, tensor<1x1xf32>) outs(%25 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_6: f32, %out: f32): | |
%28 = arith.subf %in, %in_6 : f32 | |
%29 = arith.mulf %28, %28 : f32 | |
%30 = arith.addf %29, %out : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<1x1xf32> | |
// Destination tile inside the shared output %arg3.
%extracted_slice_5 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
// %27: per element of the 1x1x10x8192 tile: (extf(x) - %14) * rsqrt(%26 / 163840 + %cst).
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %14, %26 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_5 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_6: f32, %in_7: f32, %out: f32): | |
%28 = arith.divf %in_7, %cst_0 : f32 | |
%29 = arith.addf %28, %cst : f32 | |
%30 = math.rsqrt %29 : f32 | |
%31 = arith.extf %in : f16 to f32 | |
%32 = arith.subf %31, %in_6 : f32 | |
%33 = arith.mulf %32, %30 : f32 | |
linalg.yield %33 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
// Write the finished tile back into %arg3 at the same offsets it was extracted from.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %27 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the complete 2x32x10x16384 f32 result to the output binding.
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Argument-less dispatch function: loads a 2x32x10x16384 f16 tensor from binding(0), normalizes every
// [i, j, :, :] slice in f32 (subtract the slice mean, scale by rsqrt(sum of squared deviations / 163840 + %cst)),
// and stores the 2x32x10x16384 f32 result to binding(1).
// In this snapshot a single extension (%9), a single sum (%12) and a single mean (%13) feed all later ops.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
// %c22356480: offset operand of the input binding.
%c22356480 = arith.constant 22356480 : index | |
// %cst: small constant added before math.rsqrt in the final elementwise op.
%cst = arith.constant 9.99999974E-6 : f32 | |
// %cst_0: 163840 = 10 * 16384, the element count of one 1x1x10x16384 slice; used as the divisor below.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// %cst_1: zero used to initialize the reduction accumulator.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast to index and passed through util.assume.int; the result %2 is the offset of the output binding.
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// binding(0): read-only f16 input at offset %c22356480. binding(1): write-only f32 output at offset %2.
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
// Loop over (2, 32, 16384) with step (1, 1, 8192): 2 * 32 * 2 iterations, each producing one 1x1x10x8192 output tile.
// The mapping attribute on the closing brace assigns the three induction variables to workgroup z, y, x.
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
// Input tile matching the output tile of this iteration (offset %arg2 along the last dimension).
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
// Full 1x1x10x16384 slice for (%arg0, %arg1). It does not depend on %arg2, so both iterations along the last
// dimension reduce over the same data.
%extracted_slice_2 = tensor.extract_slice %5[%arg0, %arg1, 0, 0] [1, 1, 10, 16384] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x16384xf16> | |
%8 = tensor.empty() : tensor<1x1x10x16384xf32> | |
// %9: elementwise f16 -> f32 extension of the full slice; consumed by both reductions (%12 and %14).
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2 : tensor<1x1x10x16384xf16>) outs(%8 : tensor<1x1x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%16 = arith.extf %in : f16 to f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1x10x16384xf32> | |
%10 = tensor.empty() : tensor<1x1xf32> | |
// %11: zero-filled 1x1 accumulator; used as the init of both reductions below.
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
// %12: sum of %9 over d2 and d3 (the two reduction iterators).
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<1x1x10x16384xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.addf %in, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
// %13: %12 / 163840, i.e. the mean of the slice. The body never reads %out, so %10 only supplies the result shape.
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<1x1xf32>) outs(%10 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
// %14: sum over d2 and d3 of (x - mean)^2, with x from %9 and the mean from %13.
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %13 : tensor<1x1x10x16384xf32>, tensor<1x1xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%16 = arith.subf %in, %in_4 : f32 | |
%17 = arith.mulf %16, %16 : f32 | |
%18 = arith.addf %17, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<1x1xf32> | |
// Destination tile inside the shared output %arg3.
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
// %15: per element of the 1x1x10x8192 tile: (extf(x) - %13) * rsqrt(%14 / 163840 + %cst).
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %13, %14 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_4: f32, %in_5: f32, %out: f32): | |
%16 = arith.divf %in_5, %cst_0 : f32 | |
%17 = arith.addf %16, %cst : f32 | |
%18 = math.rsqrt %17 : f32 | |
%19 = arith.extf %in : f16 to f32 | |
%20 = arith.subf %19, %in_4 : f32 | |
%21 = arith.mulf %20, %18 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
// Write the finished tile back into %arg3 at the same offsets it was extracted from.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the complete 2x32x10x16384 f32 result to the output binding.
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After ConvertAttentionToOnlineAttentionPass (iree-linalg-ext-convert-attention-to-online-attention) //----- // | |
// Argument-less dispatch function: loads a 2x32x10x16384 f16 tensor from binding(0), normalizes every
// [i, j, :, :] slice in f32 (subtract the slice mean, scale by rsqrt(sum of squared deviations / 163840 + %cst)),
// and stores the 2x32x10x16384 f32 result to binding(1).
// The body contains only linalg.generic / linalg.fill / tensor / scf ops; no attention op is present in this function.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
// %c22356480: offset operand of the input binding.
%c22356480 = arith.constant 22356480 : index | |
// %cst: small constant added before math.rsqrt in the final elementwise op.
%cst = arith.constant 9.99999974E-6 : f32 | |
// %cst_0: 163840 = 10 * 16384, the element count of one 1x1x10x16384 slice; used as the divisor below.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// %cst_1: zero used to initialize the reduction accumulator.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast to index and passed through util.assume.int; the result %2 is the offset of the output binding.
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// binding(0): read-only f16 input at offset %c22356480. binding(1): write-only f32 output at offset %2.
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
// Loop over (2, 32, 16384) with step (1, 1, 8192): 2 * 32 * 2 iterations, each producing one 1x1x10x8192 output tile.
// The mapping attribute on the closing brace assigns the three induction variables to workgroup z, y, x.
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
// Input tile matching the output tile of this iteration (offset %arg2 along the last dimension).
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
// Full 1x1x10x16384 slice for (%arg0, %arg1). It does not depend on %arg2, so both iterations along the last
// dimension reduce over the same data.
%extracted_slice_2 = tensor.extract_slice %5[%arg0, %arg1, 0, 0] [1, 1, 10, 16384] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x16384xf16> | |
%8 = tensor.empty() : tensor<1x1x10x16384xf32> | |
// %9: elementwise f16 -> f32 extension of the full slice; consumed by both reductions (%12 and %14).
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2 : tensor<1x1x10x16384xf16>) outs(%8 : tensor<1x1x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%16 = arith.extf %in : f16 to f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1x10x16384xf32> | |
%10 = tensor.empty() : tensor<1x1xf32> | |
// %11: zero-filled 1x1 accumulator; used as the init of both reductions below.
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
// %12: sum of %9 over d2 and d3 (the two reduction iterators).
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<1x1x10x16384xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.addf %in, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
// %13: %12 / 163840, i.e. the mean of the slice. The body never reads %out, so %10 only supplies the result shape.
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<1x1xf32>) outs(%10 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
// %14: sum over d2 and d3 of (x - mean)^2, with x from %9 and the mean from %13.
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %13 : tensor<1x1x10x16384xf32>, tensor<1x1xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%16 = arith.subf %in, %in_4 : f32 | |
%17 = arith.mulf %16, %16 : f32 | |
%18 = arith.addf %17, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<1x1xf32> | |
// Destination tile inside the shared output %arg3.
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
// %15: per element of the 1x1x10x8192 tile: (extf(x) - %13) * rsqrt(%14 / 163840 + %cst).
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %13, %14 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_4: f32, %in_5: f32, %out: f32): | |
%16 = arith.divf %in_5, %cst_0 : f32 | |
%17 = arith.addf %16, %cst : f32 | |
%18 = math.rsqrt %17 : f32 | |
%19 = arith.extf %in : f16 to f32 | |
%20 = arith.subf %19, %in_4 : f32 | |
%21 = arith.mulf %20, %18 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
// Write the finished tile back into %arg3 at the same offsets it was extracted from.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the complete 2x32x10x16384 f32 result to the output binding.
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
// Argument-less dispatch function: loads a 2x32x10x16384 f16 tensor from binding(0), normalizes every
// [i, j, :, :] slice in f32 (subtract the slice mean, scale by rsqrt(sum of squared deviations / 163840 + %cst)),
// and stores the 2x32x10x16384 f32 result to binding(1).
// A single extension (%9), a single sum (%12) and a single mean (%13) feed all later ops in this snapshot.
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
// %c22356480: offset operand of the input binding.
%c22356480 = arith.constant 22356480 : index | |
// %cst: small constant added before math.rsqrt in the final elementwise op.
%cst = arith.constant 9.99999974E-6 : f32 | |
// %cst_0: 163840 = 10 * 16384, the element count of one 1x1x10x16384 slice; used as the divisor below.
%cst_0 = arith.constant 1.638400e+05 : f32 | |
// %cst_1: zero used to initialize the reduction accumulator.
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded as i32, cast to index and passed through util.assume.int; the result %2 is the offset of the output binding.
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// binding(0): read-only f16 input at offset %c22356480. binding(1): write-only f32 output at offset %2.
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
// Loop over (2, 32, 16384) with step (1, 1, 8192): 2 * 32 * 2 iterations, each producing one 1x1x10x8192 output tile.
// The mapping attribute on the closing brace assigns the three induction variables to workgroup z, y, x.
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
// Input tile matching the output tile of this iteration (offset %arg2 along the last dimension).
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
// Full 1x1x10x16384 slice for (%arg0, %arg1). It does not depend on %arg2, so both iterations along the last
// dimension reduce over the same data.
%extracted_slice_2 = tensor.extract_slice %5[%arg0, %arg1, 0, 0] [1, 1, 10, 16384] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x16384xf16> | |
%8 = tensor.empty() : tensor<1x1x10x16384xf32> | |
// %9: elementwise f16 -> f32 extension of the full slice; consumed by both reductions (%12 and %14).
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2 : tensor<1x1x10x16384xf16>) outs(%8 : tensor<1x1x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%16 = arith.extf %in : f16 to f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1x10x16384xf32> | |
%10 = tensor.empty() : tensor<1x1xf32> | |
// %11: zero-filled 1x1 accumulator; used as the init of both reductions below.
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
// %12: sum of %9 over d2 and d3 (the two reduction iterators).
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<1x1x10x16384xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.addf %in, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
// %13: %12 / 163840, i.e. the mean of the slice. The body never reads %out, so %10 only supplies the result shape.
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<1x1xf32>) outs(%10 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
// %14: sum over d2 and d3 of (x - mean)^2, with x from %9 and the mean from %13.
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %13 : tensor<1x1x10x16384xf32>, tensor<1x1xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%16 = arith.subf %in, %in_4 : f32 | |
%17 = arith.mulf %16, %16 : f32 | |
%18 = arith.addf %17, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<1x1xf32> | |
// Destination tile inside the shared output %arg3.
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
// %15: per element of the 1x1x10x8192 tile: (extf(x) - %13) * rsqrt(%14 / 163840 + %cst).
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %13, %14 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_4: f32, %in_5: f32, %out: f32): | |
%16 = arith.divf %in_5, %cst_0 : f32 | |
%17 = arith.addf %16, %cst : f32 | |
%18 = math.rsqrt %17 : f32 | |
%19 = arith.extf %in : f16 to f32 | |
%20 = arith.subf %19, %in_4 : f32 | |
%21 = arith.mulf %20, %18 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
// Write the finished tile back into %arg3 at the same offsets it was extracted from.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
// Store the complete 2x32x10x16384 f32 result to the output binding.
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
%extracted_slice_2 = tensor.extract_slice %5[%arg0, %arg1, 0, 0] [1, 1, 10, 16384] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x16384xf16> | |
%8 = tensor.empty() : tensor<1x1x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2 : tensor<1x1x10x16384xf16>) outs(%8 : tensor<1x1x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%16 = arith.extf %in : f16 to f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1x10x16384xf32> | |
%10 = tensor.empty() : tensor<1x1xf32> | |
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<1x1x10x16384xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.addf %in, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<1x1xf32>) outs(%10 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %13 : tensor<1x1x10x16384xf32>, tensor<1x1xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%16 = arith.subf %in, %in_4 : f32 | |
%17 = arith.mulf %16, %16 : f32 | |
%18 = arith.addf %17, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<1x1xf32> | |
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %13, %14 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_4: f32, %in_5: f32, %out: f32): | |
%16 = arith.divf %in_5, %cst_0 : f32 | |
%17 = arith.addf %16, %cst : f32 | |
%18 = math.rsqrt %17 : f32 | |
%19 = arith.extf %in : f16 to f32 | |
%20 = arith.subf %19, %in_4 : f32 | |
%21 = arith.mulf %20, %18 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After GPUPromoteMatmulOperandsPass (iree-codegen-gpu-promote-matmul-operands) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
%extracted_slice_2 = tensor.extract_slice %5[%arg0, %arg1, 0, 0] [1, 1, 10, 16384] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x16384xf16> | |
%8 = tensor.empty() : tensor<1x1x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2 : tensor<1x1x10x16384xf16>) outs(%8 : tensor<1x1x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%16 = arith.extf %in : f16 to f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1x10x16384xf32> | |
%10 = tensor.empty() : tensor<1x1xf32> | |
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<1x1x10x16384xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.addf %in, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<1x1xf32>) outs(%10 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %13 : tensor<1x1x10x16384xf32>, tensor<1x1xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%16 = arith.subf %in, %in_4 : f32 | |
%17 = arith.mulf %16, %16 : f32 | |
%18 = arith.addf %17, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<1x1xf32> | |
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %13, %14 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_4: f32, %in_5: f32, %out: f32): | |
%16 = arith.divf %in_5, %cst_0 : f32 | |
%17 = arith.addf %16, %cst : f32 | |
%18 = math.rsqrt %17 : f32 | |
%19 = arith.extf %in : f16 to f32 | |
%20 = arith.subf %19, %in_4 : f32 | |
%21 = arith.mulf %20, %18 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
%extracted_slice_2 = tensor.extract_slice %5[%arg0, %arg1, 0, 0] [1, 1, 10, 16384] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x16384xf16> | |
%8 = tensor.empty() : tensor<1x1x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2 : tensor<1x1x10x16384xf16>) outs(%8 : tensor<1x1x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%16 = arith.extf %in : f16 to f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1x10x16384xf32> | |
%10 = tensor.empty() : tensor<1x1xf32> | |
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<1x1x10x16384xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.addf %in, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<1x1xf32>) outs(%10 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %13 : tensor<1x1x10x16384xf32>, tensor<1x1xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%16 = arith.subf %in, %in_4 : f32 | |
%17 = arith.mulf %16, %16 : f32 | |
%18 = arith.addf %17, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<1x1xf32> | |
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %13, %14 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_4: f32, %in_5: f32, %out: f32): | |
%16 = arith.divf %in_5, %cst_0 : f32 | |
%17 = arith.addf %16, %cst : f32 | |
%18 = math.rsqrt %17 : f32 | |
%19 = arith.extf %in : f16 to f32 | |
%20 = arith.subf %19, %in_4 : f32 | |
%21 = arith.mulf %20, %18 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After LoopCoalescing (affine-loop-coalescing) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
%extracted_slice_2 = tensor.extract_slice %5[%arg0, %arg1, 0, 0] [1, 1, 10, 16384] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x16384xf16> | |
%8 = tensor.empty() : tensor<1x1x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2 : tensor<1x1x10x16384xf16>) outs(%8 : tensor<1x1x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%16 = arith.extf %in : f16 to f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1x10x16384xf32> | |
%10 = tensor.empty() : tensor<1x1xf32> | |
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<1x1x10x16384xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.addf %in, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<1x1xf32>) outs(%10 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %13 : tensor<1x1x10x16384xf32>, tensor<1x1xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%16 = arith.subf %in, %in_4 : f32 | |
%17 = arith.mulf %16, %16 : f32 | |
%18 = arith.addf %17, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<1x1xf32> | |
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %13, %14 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_4: f32, %in_5: f32, %out: f32): | |
%16 = arith.divf %in_5, %cst_0 : f32 | |
%17 = arith.addf %16, %cst : f32 | |
%18 = math.rsqrt %17 : f32 | |
%19 = arith.extf %in : f16 to f32 | |
%20 = arith.subf %19, %in_4 : f32 | |
%21 = arith.mulf %20, %18 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
%extracted_slice_2 = tensor.extract_slice %5[%arg0, %arg1, 0, 0] [1, 1, 10, 16384] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x16384xf16> | |
%8 = tensor.empty() : tensor<1x1x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2 : tensor<1x1x10x16384xf16>) outs(%8 : tensor<1x1x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%16 = arith.extf %in : f16 to f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1x10x16384xf32> | |
%10 = tensor.empty() : tensor<1x1xf32> | |
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<1x1x10x16384xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.addf %in, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<1x1xf32>) outs(%10 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %13 : tensor<1x1x10x16384xf32>, tensor<1x1xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%16 = arith.subf %in, %in_4 : f32 | |
%17 = arith.mulf %16, %16 : f32 | |
%18 = arith.addf %17, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<1x1xf32> | |
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %13, %14 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_4: f32, %in_5: f32, %out: f32): | |
%16 = arith.divf %in_5, %cst_0 : f32 | |
%17 = arith.addf %16, %cst : f32 | |
%18 = math.rsqrt %17 : f32 | |
%19 = arith.extf %in : f16 to f32 | |
%20 = arith.subf %19, %in_4 : f32 | |
%21 = arith.mulf %20, %18 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
// Summary of this snapshot: for each (%arg0, %arg1) the body sums the f32-extended 1x1x10x16384 slice,
// divides by %cst_0 to obtain %13, sums the squared deviations from %13 into %14, and finally writes
// (x - %13) * rsqrt(%14 / %cst_0 + %cst) for one 1x1x10x8192 output tile.
%c22356480 = arith.constant 22356480 : index | |
// %cst is the small constant added before math.rsqrt; %cst_0 = 163840 = 10 * 16384, i.e. the element
// count of the two reduced dimensions; %cst_1 is the zero used to initialise the accumulators.
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
// Constant ordinal 0 is loaded, cast to index and (after util.assume.int) used as the offset of binding 1.
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at a fixed offset. Binding 1: write-only f32 output at offset %2.
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
// One iteration per (d0, d1, 8192-wide tile of d3); the mapping attribute on the closing brace assigns
// the three induction variables to workgroup z / y / x.
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
// Tile consumed by the final elementwise op (depends on %arg2).
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
// Full slice consumed by the reductions. It does not depend on %arg2, so both d3 tiles of a given
// (%arg0, %arg1) read the same data and repeat the two reductions below.
%extracted_slice_2 = tensor.extract_slice %5[%arg0, %arg1, 0, 0] [1, 1, 10, 16384] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x16384xf16> | |
%8 = tensor.empty() : tensor<1x1x10x16384xf32> | |
// Extend the full slice from f16 to f32.
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_2 : tensor<1x1x10x16384xf16>) outs(%8 : tensor<1x1x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%16 = arith.extf %in : f16 to f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1x10x16384xf32> | |
%10 = tensor.empty() : tensor<1x1xf32> | |
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
// First reduction: sum over d2 and d3 into a 1x1 accumulator.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<1x1x10x16384xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.addf %in, %out : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
// Sum divided by the element count: the mean of the slice.
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<1x1xf32>) outs(%10 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
// Second reduction: sum of squared deviations from the mean %13.
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %13 : tensor<1x1x10x16384xf32>, tensor<1x1xf32>) outs(%11 : tensor<1x1xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%16 = arith.subf %in, %in_4 : f32 | |
%17 = arith.mulf %16, %16 : f32 | |
%18 = arith.addf %17, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<1x1xf32> | |
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
// Elementwise output for this tile: (extf(x) - mean) * rsqrt(sum_sq / %cst_0 + %cst).
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %13, %14 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_4: f32, %in_5: f32, %out: f32): | |
%16 = arith.divf %in_5, %cst_0 : f32 | |
%17 = arith.addf %16, %cst : f32 | |
%18 = math.rsqrt %17 : f32 | |
%19 = arith.extf %in : f16 to f32 | |
%20 = arith.subf %19, %in_4 : f32 | |
%21 = arith.mulf %20, %18 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
// Publish the tile into the shared output tensor.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
// Summary of this snapshot: each of the two reductions is expressed as an scf.for nest that accumulates
// elementwise into a 1x1x1x8192 partial tensor, followed by a linalg.reduce that collapses the partial
// tensor to 1x1. The f16 -> f32 extension is performed per 1x1x1x8192 slice inside the loops.
// Loop bounds and steps used by the scf.for nests below.
%c8192 = arith.constant 8192 : index | |
%c1 = arith.constant 1 : index | |
%c16384 = arith.constant 16384 : index | |
%c10 = arith.constant 10 : index | |
%c0 = arith.constant 0 : index | |
%c22356480 = arith.constant 22356480 : index | |
// %cst is the small constant added before math.rsqrt; %cst_0 = 163840 = 10 * 16384 (element count of
// the reduced dimensions); %cst_1 is the zero used to initialise the accumulators.
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at a fixed offset. Binding 1: write-only f32 output at offset %2.
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
// One iteration per (d0, d1, 8192-wide tile of d3), mapped to workgroup z / y / x.
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
// %9: zero-filled 1x1 init shared by both linalg.reduce ops; %11: zero-filled partial accumulator.
%8 = tensor.empty() : tensor<1x1xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%10 = tensor.empty() : tensor<1x1x1x8192xf32> | |
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1x1x8192xf32>) -> tensor<1x1x1x8192xf32> | |
// First reduction as a loop nest: %arg4 walks d2 (0..10 step 1), %arg6 walks d3 in chunks of 8192.
// The slices are taken from %5 without using %arg2, so the nest covers the full 10x16384 extent.
%12 = scf.for %arg4 = %c0 to %c10 step %c1 iter_args(%arg5 = %11) -> (tensor<1x1x1x8192xf32>) { | |
%18 = scf.for %arg6 = %c0 to %c16384 step %c8192 iter_args(%arg7 = %arg5) -> (tensor<1x1x1x8192xf32>) { | |
%extracted_slice_4 = tensor.extract_slice %5[%arg0, %arg1, %arg4, %arg6] [1, 1, 1, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x1x8192xf16> | |
%19 = tensor.empty() : tensor<1x1x1x8192xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_4 : tensor<1x1x1x8192xf16>) outs(%19 : tensor<1x1x1x8192xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%22 = arith.extf %in : f16 to f32 | |
linalg.yield %22 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
// Elementwise add of the extended slice into the running partial %arg7 (all iterators parallel).
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%20 : tensor<1x1x1x8192xf32>) outs(%arg7 : tensor<1x1x1x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%22 = arith.addf %in, %out : f32 | |
linalg.yield %22 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
scf.yield %21 : tensor<1x1x1x8192xf32> | |
} | |
scf.yield %18 : tensor<1x1x1x8192xf32> | |
} | |
// Collapse the 8192 partial sums (dimensions 2 and 3) into the 1x1 total.
%reduced = linalg.reduce ins(%12 : tensor<1x1x1x8192xf32>) outs(%9 : tensor<1x1xf32>) dimensions = [2, 3] | |
(%in: f32, %init: f32) { | |
%18 = arith.addf %in, %init : f32 | |
linalg.yield %18 : f32 | |
} | |
// Total divided by the element count: the mean.
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%reduced : tensor<1x1xf32>) outs(%8 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<1x1xf32> | |
%14 = tensor.empty() : tensor<1x1x1x8192xf32> | |
%15 = linalg.fill ins(%cst_1 : f32) outs(%14 : tensor<1x1x1x8192xf32>) -> tensor<1x1x1x8192xf32> | |
// Second reduction, same loop structure: accumulates (x - mean)^2 into the partial tensor.
%16 = scf.for %arg4 = %c0 to %c10 step %c1 iter_args(%arg5 = %15) -> (tensor<1x1x1x8192xf32>) { | |
%18 = scf.for %arg6 = %c0 to %c16384 step %c8192 iter_args(%arg7 = %arg5) -> (tensor<1x1x1x8192xf32>) { | |
%extracted_slice_4 = tensor.extract_slice %5[%arg0, %arg1, %arg4, %arg6] [1, 1, 1, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x1x8192xf16> | |
%19 = tensor.empty() : tensor<1x1x1x8192xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_4 : tensor<1x1x1x8192xf16>) outs(%19 : tensor<1x1x1x8192xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%22 = arith.extf %in : f16 to f32 | |
linalg.yield %22 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%20, %13 : tensor<1x1x1x8192xf32>, tensor<1x1xf32>) outs(%arg7 : tensor<1x1x1x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_5: f32, %out: f32): | |
%22 = arith.subf %in, %in_5 : f32 | |
%23 = arith.mulf %22, %22 : f32 | |
%24 = arith.addf %23, %out : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
scf.yield %21 : tensor<1x1x1x8192xf32> | |
} | |
scf.yield %18 : tensor<1x1x1x8192xf32> | |
} | |
// Collapse the partial sums of squared deviations into the 1x1 total.
%reduced_2 = linalg.reduce ins(%16 : tensor<1x1x1x8192xf32>) outs(%9 : tensor<1x1xf32>) dimensions = [2, 3] | |
(%in: f32, %init: f32) { | |
%18 = arith.addf %in, %init : f32 | |
linalg.yield %18 : f32 | |
} | |
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
// Elementwise output for this tile: (extf(x) - mean) * rsqrt(sum_sq / %cst_0 + %cst).
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %13, %reduced_2 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_4: f32, %in_5: f32, %out: f32): | |
%18 = arith.divf %in_5, %cst_0 : f32 | |
%19 = arith.addf %18, %cst : f32 | |
%20 = math.rsqrt %19 : f32 | |
%21 = arith.extf %in : f16 to f32 | |
%22 = arith.subf %21, %in_4 : f32 | |
%23 = arith.mulf %22, %20 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After LoopCoalescing (affine-loop-coalescing) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
// Summary of this snapshot: each reduction is a single scf.for of 20 iterations. Inside it,
// affine.delinearize_index splits the induction variable into a d2 index (basis 10) and a chunk
// index (basis 2), and affine.apply scales the chunk index by 8192 to form the d3 offset.
// NOTE: %c8192, %c1, %c16384, %c10 and %c0 below are not referenced anywhere else in this function.
%c8192 = arith.constant 8192 : index | |
%c1 = arith.constant 1 : index | |
%c16384 = arith.constant 16384 : index | |
%c10 = arith.constant 10 : index | |
%c0 = arith.constant 0 : index | |
%c22356480 = arith.constant 22356480 : index | |
// %cst is the small constant added before math.rsqrt; %cst_0 = 163840 = 10 * 16384 (element count of
// the reduced dimensions); %cst_1 is the zero used to initialise the accumulators.
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at a fixed offset. Binding 1: write-only f32 output at offset %2.
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
// One iteration per (d0, d1, 8192-wide tile of d3), mapped to workgroup z / y / x.
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
%8 = tensor.empty() : tensor<1x1xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%10 = tensor.empty() : tensor<1x1x1x8192xf32> | |
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1x1x8192xf32>) -> tensor<1x1x1x8192xf32> | |
// Constants materialised for the first coalesced loop. Only %c0_2 (lower bound), %c20 (upper bound)
// and %c1_4 (step) are used by the scf.for; %c10_3, %c0_5, %c2 and %c1_6 have no uses in this function.
%c0_2 = arith.constant 0 : index | |
%c10_3 = arith.constant 10 : index | |
%c1_4 = arith.constant 1 : index | |
%c0_5 = arith.constant 0 : index | |
%c2 = arith.constant 2 : index | |
%c1_6 = arith.constant 1 : index | |
%c20 = arith.constant 20 : index | |
// First reduction: 20 iterations (10 * 2), accumulating into the 1x1x1x8192 partial tensor.
%12 = scf.for %arg4 = %c0_2 to %c20 step %c1_4 iter_args(%arg5 = %11) -> (tensor<1x1x1x8192xf32>) { | |
// %18#0 is the d2 index, %18#1 the chunk index; %19 = %18#1 * 8192 is the d3 offset.
%18:2 = affine.delinearize_index %arg4 into (10, 2) : index, index | |
%19 = affine.apply affine_map<(d0) -> (d0 * 8192)>(%18#1) | |
%extracted_slice_16 = tensor.extract_slice %5[%arg0, %arg1, %18#0, %19] [1, 1, 1, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x1x8192xf16> | |
%20 = tensor.empty() : tensor<1x1x1x8192xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_16 : tensor<1x1x1x8192xf16>) outs(%20 : tensor<1x1x1x8192xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%23 = arith.extf %in : f16 to f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
// Elementwise add of the extended slice into the running partial %arg5.
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21 : tensor<1x1x1x8192xf32>) outs(%arg5 : tensor<1x1x1x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
scf.yield %22 : tensor<1x1x1x8192xf32> | |
} | |
// Collapse the 8192 partial sums (dimensions 2 and 3) into the 1x1 total.
%reduced = linalg.reduce ins(%12 : tensor<1x1x1x8192xf32>) outs(%9 : tensor<1x1xf32>) dimensions = [2, 3] | |
(%in: f32, %init: f32) { | |
%18 = arith.addf %in, %init : f32 | |
linalg.yield %18 : f32 | |
} | |
// Total divided by the element count: the mean.
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%reduced : tensor<1x1xf32>) outs(%8 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<1x1xf32> | |
%14 = tensor.empty() : tensor<1x1x1x8192xf32> | |
%15 = linalg.fill ins(%cst_1 : f32) outs(%14 : tensor<1x1x1x8192xf32>) -> tensor<1x1x1x8192xf32> | |
// Constants materialised for the second coalesced loop. Only %c0_7, %c20_13 and %c1_9 are used;
// %c10_8, %c0_10, %c2_11 and %c1_12 have no uses in this function.
%c0_7 = arith.constant 0 : index | |
%c10_8 = arith.constant 10 : index | |
%c1_9 = arith.constant 1 : index | |
%c0_10 = arith.constant 0 : index | |
%c2_11 = arith.constant 2 : index | |
%c1_12 = arith.constant 1 : index | |
%c20_13 = arith.constant 20 : index | |
// Second reduction, same loop structure: accumulates (x - mean)^2 into the partial tensor.
%16 = scf.for %arg4 = %c0_7 to %c20_13 step %c1_9 iter_args(%arg5 = %15) -> (tensor<1x1x1x8192xf32>) { | |
%18:2 = affine.delinearize_index %arg4 into (10, 2) : index, index | |
%19 = affine.apply affine_map<(d0) -> (d0 * 8192)>(%18#1) | |
%extracted_slice_16 = tensor.extract_slice %5[%arg0, %arg1, %18#0, %19] [1, 1, 1, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x1x8192xf16> | |
%20 = tensor.empty() : tensor<1x1x1x8192xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_16 : tensor<1x1x1x8192xf16>) outs(%20 : tensor<1x1x1x8192xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%23 = arith.extf %in : f16 to f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %13 : tensor<1x1x1x8192xf32>, tensor<1x1xf32>) outs(%arg5 : tensor<1x1x1x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_17: f32, %out: f32): | |
%23 = arith.subf %in, %in_17 : f32 | |
%24 = arith.mulf %23, %23 : f32 | |
%25 = arith.addf %24, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
scf.yield %22 : tensor<1x1x1x8192xf32> | |
} | |
// Collapse the partial sums of squared deviations into the 1x1 total.
%reduced_14 = linalg.reduce ins(%16 : tensor<1x1x1x8192xf32>) outs(%9 : tensor<1x1xf32>) dimensions = [2, 3] | |
(%in: f32, %init: f32) { | |
%18 = arith.addf %in, %init : f32 | |
linalg.yield %18 : f32 | |
} | |
%extracted_slice_15 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
// Elementwise output for this tile: (extf(x) - mean) * rsqrt(sum_sq / %cst_0 + %cst).
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %13, %reduced_14 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_15 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_16: f32, %in_17: f32, %out: f32): | |
%18 = arith.divf %in_17, %cst_0 : f32 | |
%19 = arith.addf %18, %cst : f32 | |
%20 = math.rsqrt %19 : f32 | |
%21 = arith.extf %in : f16 to f32 | |
%22 = arith.subf %21, %in_16 : f32 | |
%23 = arith.mulf %22, %20 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
// Summary of this snapshot: both reduction loops share the three index constants %c0, %c20 and %c1
// defined at function scope. Each loop runs 20 iterations, accumulates into a 1x1x1x8192 partial
// tensor, and is followed by a linalg.reduce to 1x1.
%c20 = arith.constant 20 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c22356480 = arith.constant 22356480 : index | |
// %cst is the small constant added before math.rsqrt; %cst_0 = 163840 = 10 * 16384 (element count of
// the reduced dimensions); %cst_1 is the zero used to initialise the accumulators.
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
// Binding 0: read-only f16 input at a fixed offset. Binding 1: write-only f32 output at offset %2.
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
// One iteration per (d0, d1, 8192-wide tile of d3), mapped to workgroup z / y / x.
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
// %9: zero-filled 1x1 init shared by both linalg.reduce ops; %11: zero-filled partial accumulator.
%8 = tensor.empty() : tensor<1x1xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%10 = tensor.empty() : tensor<1x1x1x8192xf32> | |
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1x1x8192xf32>) -> tensor<1x1x1x8192xf32> | |
// First reduction: 20 iterations; the slice offsets do not involve %arg2, so the loop covers the
// full 10x16384 extent regardless of which d3 tile this forall iteration writes.
%12 = scf.for %arg4 = %c0 to %c20 step %c1 iter_args(%arg5 = %11) -> (tensor<1x1x1x8192xf32>) { | |
// %18#0 is the d2 index, %18#1 the chunk index; %19 = %18#1 * 8192 is the d3 offset.
%18:2 = affine.delinearize_index %arg4 into (10, 2) : index, index | |
%19 = affine.apply affine_map<(d0) -> (d0 * 8192)>(%18#1) | |
%extracted_slice_4 = tensor.extract_slice %5[%arg0, %arg1, %18#0, %19] [1, 1, 1, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x1x8192xf16> | |
%20 = tensor.empty() : tensor<1x1x1x8192xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_4 : tensor<1x1x1x8192xf16>) outs(%20 : tensor<1x1x1x8192xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%23 = arith.extf %in : f16 to f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
// Elementwise add of the extended slice into the running partial %arg5.
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21 : tensor<1x1x1x8192xf32>) outs(%arg5 : tensor<1x1x1x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
scf.yield %22 : tensor<1x1x1x8192xf32> | |
} | |
// Collapse the 8192 partial sums (dimensions 2 and 3) into the 1x1 total.
%reduced = linalg.reduce ins(%12 : tensor<1x1x1x8192xf32>) outs(%9 : tensor<1x1xf32>) dimensions = [2, 3] | |
(%in: f32, %init: f32) { | |
%18 = arith.addf %in, %init : f32 | |
linalg.yield %18 : f32 | |
} | |
// Total divided by the element count: the mean.
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%reduced : tensor<1x1xf32>) outs(%8 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<1x1xf32> | |
%14 = tensor.empty() : tensor<1x1x1x8192xf32> | |
%15 = linalg.fill ins(%cst_1 : f32) outs(%14 : tensor<1x1x1x8192xf32>) -> tensor<1x1x1x8192xf32> | |
// Second reduction, same loop structure: accumulates (x - mean)^2 into the partial tensor.
%16 = scf.for %arg4 = %c0 to %c20 step %c1 iter_args(%arg5 = %15) -> (tensor<1x1x1x8192xf32>) { | |
%18:2 = affine.delinearize_index %arg4 into (10, 2) : index, index | |
%19 = affine.apply affine_map<(d0) -> (d0 * 8192)>(%18#1) | |
%extracted_slice_4 = tensor.extract_slice %5[%arg0, %arg1, %18#0, %19] [1, 1, 1, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x1x8192xf16> | |
%20 = tensor.empty() : tensor<1x1x1x8192xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_4 : tensor<1x1x1x8192xf16>) outs(%20 : tensor<1x1x1x8192xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%23 = arith.extf %in : f16 to f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %13 : tensor<1x1x1x8192xf32>, tensor<1x1xf32>) outs(%arg5 : tensor<1x1x1x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_5: f32, %out: f32): | |
%23 = arith.subf %in, %in_5 : f32 | |
%24 = arith.mulf %23, %23 : f32 | |
%25 = arith.addf %24, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
scf.yield %22 : tensor<1x1x1x8192xf32> | |
} | |
// Collapse the partial sums of squared deviations into the 1x1 total.
%reduced_2 = linalg.reduce ins(%16 : tensor<1x1x1x8192xf32>) outs(%9 : tensor<1x1xf32>) dimensions = [2, 3] | |
(%in: f32, %init: f32) { | |
%18 = arith.addf %in, %init : f32 | |
linalg.yield %18 : f32 | |
} | |
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
// Elementwise output for this tile: (extf(x) - mean) * rsqrt(sum_sq / %cst_0 + %cst).
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %13, %reduced_2 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_4: f32, %in_5: f32, %out: f32): | |
%18 = arith.divf %in_5, %cst_0 : f32 | |
%19 = arith.addf %18, %cst : f32 | |
%20 = math.rsqrt %19 : f32 | |
%21 = arith.extf %in : f16 to f32 | |
%22 = arith.subf %21, %in_4 : f32 | |
%23 = arith.mulf %22, %20 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
// Publish the tile into the shared output tensor.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%c20 = arith.constant 20 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
%8 = tensor.empty() : tensor<1x1xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%10 = tensor.empty() : tensor<1x1x1x8192xf32> | |
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1x1x8192xf32>) -> tensor<1x1x1x8192xf32> | |
%12 = scf.for %arg4 = %c0 to %c20 step %c1 iter_args(%arg5 = %11) -> (tensor<1x1x1x8192xf32>) { | |
%16:2 = affine.delinearize_index %arg4 into (10, 2) : index, index | |
%17 = affine.apply affine_map<(d0) -> (d0 * 8192)>(%16#1) | |
%extracted_slice_4 = tensor.extract_slice %5[%arg0, %arg1, %16#0, %17] [1, 1, 1, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x1x8192xf16> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_4 : tensor<1x1x1x8192xf16>) outs(%10 : tensor<1x1x1x8192xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%20 = arith.extf %in : f16 to f32 | |
linalg.yield %20 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18 : tensor<1x1x1x8192xf32>) outs(%arg5 : tensor<1x1x1x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%20 = arith.addf %in, %out : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
scf.yield %19 : tensor<1x1x1x8192xf32> | |
} | |
%reduced = linalg.reduce ins(%12 : tensor<1x1x1x8192xf32>) outs(%9 : tensor<1x1xf32>) dimensions = [2, 3] | |
(%in: f32, %init: f32) { | |
%16 = arith.addf %in, %init : f32 | |
linalg.yield %16 : f32 | |
} | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%reduced : tensor<1x1xf32>) outs(%8 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
%14 = scf.for %arg4 = %c0 to %c20 step %c1 iter_args(%arg5 = %11) -> (tensor<1x1x1x8192xf32>) { | |
%16:2 = affine.delinearize_index %arg4 into (10, 2) : index, index | |
%17 = affine.apply affine_map<(d0) -> (d0 * 8192)>(%16#1) | |
%extracted_slice_4 = tensor.extract_slice %5[%arg0, %arg1, %16#0, %17] [1, 1, 1, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x1x8192xf16> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_4 : tensor<1x1x1x8192xf16>) outs(%10 : tensor<1x1x1x8192xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%20 = arith.extf %in : f16 to f32 | |
linalg.yield %20 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18, %13 : tensor<1x1x1x8192xf32>, tensor<1x1xf32>) outs(%arg5 : tensor<1x1x1x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_5: f32, %out: f32): | |
%20 = arith.subf %in, %in_5 : f32 | |
%21 = arith.mulf %20, %20 : f32 | |
%22 = arith.addf %21, %out : f32 | |
linalg.yield %22 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
scf.yield %19 : tensor<1x1x1x8192xf32> | |
} | |
%reduced_2 = linalg.reduce ins(%14 : tensor<1x1x1x8192xf32>) outs(%9 : tensor<1x1xf32>) dimensions = [2, 3] | |
(%in: f32, %init: f32) { | |
%16 = arith.addf %in, %init : f32 | |
linalg.yield %16 : f32 | |
} | |
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %13, %reduced_2 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_4: f32, %in_5: f32, %out: f32): | |
%16 = arith.divf %in_5, %cst_0 : f32 | |
%17 = arith.addf %16, %cst : f32 | |
%18 = math.rsqrt %17 : f32 | |
%19 = arith.extf %in : f16 to f32 | |
%20 = arith.subf %19, %in_4 : f32 | |
%21 = arith.mulf %20, %18 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After DecomposeAttentionPass (iree-linalg-ext-decompose-attention) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%c20 = arith.constant 20 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
%8 = tensor.empty() : tensor<1x1xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%10 = tensor.empty() : tensor<1x1x1x8192xf32> | |
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1x1x8192xf32>) -> tensor<1x1x1x8192xf32> | |
%12 = scf.for %arg4 = %c0 to %c20 step %c1 iter_args(%arg5 = %11) -> (tensor<1x1x1x8192xf32>) { | |
%16:2 = affine.delinearize_index %arg4 into (10, 2) : index, index | |
%17 = affine.apply affine_map<(d0) -> (d0 * 8192)>(%16#1) | |
%extracted_slice_4 = tensor.extract_slice %5[%arg0, %arg1, %16#0, %17] [1, 1, 1, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x1x8192xf16> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_4 : tensor<1x1x1x8192xf16>) outs(%10 : tensor<1x1x1x8192xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%20 = arith.extf %in : f16 to f32 | |
linalg.yield %20 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18 : tensor<1x1x1x8192xf32>) outs(%arg5 : tensor<1x1x1x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%20 = arith.addf %in, %out : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
scf.yield %19 : tensor<1x1x1x8192xf32> | |
} | |
%reduced = linalg.reduce ins(%12 : tensor<1x1x1x8192xf32>) outs(%9 : tensor<1x1xf32>) dimensions = [2, 3] | |
(%in: f32, %init: f32) { | |
%16 = arith.addf %in, %init : f32 | |
linalg.yield %16 : f32 | |
} | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%reduced : tensor<1x1xf32>) outs(%8 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
%14 = scf.for %arg4 = %c0 to %c20 step %c1 iter_args(%arg5 = %11) -> (tensor<1x1x1x8192xf32>) { | |
%16:2 = affine.delinearize_index %arg4 into (10, 2) : index, index | |
%17 = affine.apply affine_map<(d0) -> (d0 * 8192)>(%16#1) | |
%extracted_slice_4 = tensor.extract_slice %5[%arg0, %arg1, %16#0, %17] [1, 1, 1, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x1x8192xf16> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_4 : tensor<1x1x1x8192xf16>) outs(%10 : tensor<1x1x1x8192xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%20 = arith.extf %in : f16 to f32 | |
linalg.yield %20 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18, %13 : tensor<1x1x1x8192xf32>, tensor<1x1xf32>) outs(%arg5 : tensor<1x1x1x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_5: f32, %out: f32): | |
%20 = arith.subf %in, %in_5 : f32 | |
%21 = arith.mulf %20, %20 : f32 | |
%22 = arith.addf %21, %out : f32 | |
linalg.yield %22 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
scf.yield %19 : tensor<1x1x1x8192xf32> | |
} | |
%reduced_2 = linalg.reduce ins(%14 : tensor<1x1x1x8192xf32>) outs(%9 : tensor<1x1xf32>) dimensions = [2, 3] | |
(%in: f32, %init: f32) { | |
%16 = arith.addf %in, %init : f32 | |
linalg.yield %16 : f32 | |
} | |
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %13, %reduced_2 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_4: f32, %in_5: f32, %out: f32): | |
%16 = arith.divf %in_5, %cst_0 : f32 | |
%17 = arith.addf %16, %cst : f32 | |
%18 = math.rsqrt %17 : f32 | |
%19 = arith.extf %in : f16 to f32 | |
%20 = arith.subf %19, %in_4 : f32 | |
%21 = arith.mulf %20, %18 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%c20 = arith.constant 20 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
%8 = tensor.empty() : tensor<1x1xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%10 = tensor.empty() : tensor<1x1x1x8192xf32> | |
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1x1x8192xf32>) -> tensor<1x1x1x8192xf32> | |
%12 = scf.for %arg4 = %c0 to %c20 step %c1 iter_args(%arg5 = %11) -> (tensor<1x1x1x8192xf32>) { | |
%16:2 = affine.delinearize_index %arg4 into (10, 2) : index, index | |
%17 = affine.apply affine_map<(d0) -> (d0 * 8192)>(%16#1) | |
%extracted_slice_4 = tensor.extract_slice %5[%arg0, %arg1, %16#0, %17] [1, 1, 1, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x1x8192xf16> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_4 : tensor<1x1x1x8192xf16>) outs(%10 : tensor<1x1x1x8192xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%20 = arith.extf %in : f16 to f32 | |
linalg.yield %20 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18 : tensor<1x1x1x8192xf32>) outs(%arg5 : tensor<1x1x1x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%20 = arith.addf %in, %out : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
scf.yield %19 : tensor<1x1x1x8192xf32> | |
} | |
%reduced = linalg.reduce ins(%12 : tensor<1x1x1x8192xf32>) outs(%9 : tensor<1x1xf32>) dimensions = [2, 3] | |
(%in: f32, %init: f32) { | |
%16 = arith.addf %in, %init : f32 | |
linalg.yield %16 : f32 | |
} | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%reduced : tensor<1x1xf32>) outs(%8 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
%14 = scf.for %arg4 = %c0 to %c20 step %c1 iter_args(%arg5 = %11) -> (tensor<1x1x1x8192xf32>) { | |
%16:2 = affine.delinearize_index %arg4 into (10, 2) : index, index | |
%17 = affine.apply affine_map<(d0) -> (d0 * 8192)>(%16#1) | |
%extracted_slice_4 = tensor.extract_slice %5[%arg0, %arg1, %16#0, %17] [1, 1, 1, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x1x8192xf16> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_4 : tensor<1x1x1x8192xf16>) outs(%10 : tensor<1x1x1x8192xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%20 = arith.extf %in : f16 to f32 | |
linalg.yield %20 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18, %13 : tensor<1x1x1x8192xf32>, tensor<1x1xf32>) outs(%arg5 : tensor<1x1x1x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_5: f32, %out: f32): | |
%20 = arith.subf %in, %in_5 : f32 | |
%21 = arith.mulf %20, %20 : f32 | |
%22 = arith.addf %21, %out : f32 | |
linalg.yield %22 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
scf.yield %19 : tensor<1x1x1x8192xf32> | |
} | |
%reduced_2 = linalg.reduce ins(%14 : tensor<1x1x1x8192xf32>) outs(%9 : tensor<1x1xf32>) dimensions = [2, 3] | |
(%in: f32, %init: f32) { | |
%16 = arith.addf %in, %init : f32 | |
linalg.yield %16 : f32 | |
} | |
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %13, %reduced_2 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_4: f32, %in_5: f32, %out: f32): | |
%16 = arith.divf %in_5, %cst_0 : f32 | |
%17 = arith.addf %16, %cst : f32 | |
%18 = math.rsqrt %17 : f32 | |
%19 = arith.extf %in : f16 to f32 | |
%20 = arith.subf %19, %in_4 : f32 | |
%21 = arith.mulf %20, %18 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%c20 = arith.constant 20 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 43328000, umax = 43328000, udiv = 43328000>, <umin = 0, umax = 0>] : index | |
%3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c22356480) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> | |
%4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 32, 16384) step (1, 1, 8192) shared_outs(%arg3 = %6) -> (tensor<2x32x10x16384xf32>) { | |
%extracted_slice = tensor.extract_slice %5[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x10x8192xf16> | |
%8 = tensor.empty() : tensor<1x1xf32> | |
%9 = linalg.fill ins(%cst_1 : f32) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32> | |
%10 = tensor.empty() : tensor<1x1x1x8192xf32> | |
%11 = linalg.fill ins(%cst_1 : f32) outs(%10 : tensor<1x1x1x8192xf32>) -> tensor<1x1x1x8192xf32> | |
%12 = scf.for %arg4 = %c0 to %c20 step %c1 iter_args(%arg5 = %11) -> (tensor<1x1x1x8192xf32>) { | |
%16:2 = affine.delinearize_index %arg4 into (10, 2) : index, index | |
%17 = affine.apply affine_map<(d0) -> (d0 * 8192)>(%16#1) | |
%extracted_slice_4 = tensor.extract_slice %5[%arg0, %arg1, %16#0, %17] [1, 1, 1, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x1x8192xf16> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_4 : tensor<1x1x1x8192xf16>) outs(%10 : tensor<1x1x1x8192xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%20 = arith.extf %in : f16 to f32 | |
linalg.yield %20 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18 : tensor<1x1x1x8192xf32>) outs(%arg5 : tensor<1x1x1x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %out: f32): | |
%20 = arith.addf %in, %out : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
scf.yield %19 : tensor<1x1x1x8192xf32> | |
} | |
%reduced = linalg.reduce ins(%12 : tensor<1x1x1x8192xf32>) outs(%9 : tensor<1x1xf32>) dimensions = [2, 3] | |
(%in: f32, %init: f32) { | |
%16 = arith.addf %in, %init : f32 | |
linalg.yield %16 : f32 | |
} | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%reduced : tensor<1x1xf32>) outs(%8 : tensor<1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%16 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<1x1xf32> | |
%14 = scf.for %arg4 = %c0 to %c20 step %c1 iter_args(%arg5 = %11) -> (tensor<1x1x1x8192xf32>) { | |
%16:2 = affine.delinearize_index %arg4 into (10, 2) : index, index | |
%17 = affine.apply affine_map<(d0) -> (d0 * 8192)>(%16#1) | |
%extracted_slice_4 = tensor.extract_slice %5[%arg0, %arg1, %16#0, %17] [1, 1, 1, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf16> to tensor<1x1x1x8192xf16> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_4 : tensor<1x1x1x8192xf16>) outs(%10 : tensor<1x1x1x8192xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%20 = arith.extf %in : f16 to f32 | |
linalg.yield %20 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18, %13 : tensor<1x1x1x8192xf32>, tensor<1x1xf32>) outs(%arg5 : tensor<1x1x1x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 1, 8192], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 1, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 0]}>} { | |
^bb0(%in: f32, %in_5: f32, %out: f32): | |
%20 = arith.subf %in, %in_5 : f32 | |
%21 = arith.mulf %20, %20 : f32 | |
%22 = arith.addf %21, %out : f32 | |
linalg.yield %22 : f32 | |
} -> tensor<1x1x1x8192xf32> | |
scf.yield %19 : tensor<1x1x1x8192xf32> | |
} | |
%reduced_2 = linalg.reduce ins(%14 : tensor<1x1x1x8192xf32>) outs(%9 : tensor<1x1xf32>) dimensions = [2, 3] | |
(%in: f32, %init: f32) { | |
%16 = arith.addf %in, %init : f32 | |
linalg.yield %16 : f32 | |
} | |
%extracted_slice_3 = tensor.extract_slice %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<2x32x10x16384xf32> to tensor<1x1x10x8192xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %13, %reduced_2 : tensor<1x1x10x8192xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x10x8192xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 8], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 0, 8192]}>} { | |
^bb0(%in: f16, %in_4: f32, %in_5: f32, %out: f32): | |
%16 = arith.divf %in_5, %cst_0 : f32 | |
%17 = arith.addf %16, %cst : f32 | |
%18 = math.rsqrt %17 : f32 | |
%19 = arith.extf %in : f16 to f32 | |
%20 = arith.subf %19, %in_4 : f32 | |
%21 = arith.mulf %20, %18 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x10x8192xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %15 into %arg3[%arg0, %arg1, 0, %arg2] [1, 1, 10, 8192] [1, 1, 1, 1] : tensor<1x1x10x8192xf32> into tensor<2x32x10x16384xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %7, %4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
// -----// IR Dump After DecomposeHorizontallyFusedGemmsPass (iree-codegen-gpu-decompose-horizontally-fused-gemms) //----- // | |
func.func @main$async_dispatch_14_elementwise_2x32x10x16384_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} { | |
%c20 = arith.constant 20 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c22356480 = arith.constant 22356480 : index | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = arith.index_castui %0 : i32 to index | |
%2 = util.assume.int %1[<umin = 43328000, umax = 43328000, udiv = 43328000 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment