@bjacob
Created April 24, 2025 19:25
// -----// IR Dump After LoweringConfigInterpreterPass (iree-codegen-lowering-config-interpreter) //----- //
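// Overview: this dispatch computes C[b, m, n] += A[b, m, k] * B[n, k], with A: ?x256x4096 and
// B: 4096x4096 in f8E4M3FNUZ accumulating in f32, using the LLVMGPUTileAndFuse pipeline
// (512 threads per workgroup, subgroup size 64) and the "cast_and_call_expanded_f8_pingpong_matmul"
// lowering strategy applied by the tuning spec.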
func.func @matmul_dispatch_0_matmul_like_Dx256x4096x4096_f8E4M3FNUZxf8E4M3FNUZxf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>, llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
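// The dynamic batch extent arrives as two 32-bit push constants and is reassembled into a
// 64-bit index below; util.assume.int gives later passes a known range for it.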
%0 = hal.interface.constant.load layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = arith.extui %0 : i32 to i64
%3 = arith.extui %1 : i32 to i64
%4 = arith.shli %3, %c32_i64 : i64
%5 = arith.ori %2, %4 : i64
%6 = arith.index_castui %5 : i64 to index
%7 = util.assume.int %6<umin = 0, umax = 35184372088831> : index
%8 = hal.interface.binding.subspan layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>
%9 = flow.dispatch.workload.ordinal %7, 0 : index
%10 = hal.interface.binding.subspan layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x256x4096xf8E4M3FNUZ>>{%9}
%11 = hal.interface.binding.subspan layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<?x256x4096xf32>>{%9}
%12 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0], sizes = [%9, 256, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x256x4096xf8E4M3FNUZ>>{%9} -> tensor<?x256x4096xf8E4M3FNUZ>
%13 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%14 = tensor.empty(%9) : tensor<?x256x4096xf32>
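// Workgroup-level tiling per the workgroup = [1, 256, 256, 0] config: each workgroup owns one
// batch element and a 256x256 output tile, reducing over the full K = 4096.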
%15 = scf.forall (%arg0, %arg1) = (0, 0) to (%9, 4096) step (1, 256) shared_outs(%arg2 = %14) -> (tensor<?x256x4096xf32>) {
%extracted_slice = tensor.extract_slice %12[%arg0, 0, 0] [1, 256, 4096] [1, 1, 1] : tensor<?x256x4096xf8E4M3FNUZ> to tensor<1x256x4096xf8E4M3FNUZ>
%extracted_slice_0 = tensor.extract_slice %13[%arg1, 0] [256, 4096] [1, 1] : tensor<4096x4096xf8E4M3FNUZ> to tensor<256x4096xf8E4M3FNUZ>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, 0, %arg1] [1, 256, 256] [1, 1, 1] : tensor<?x256x4096xf32> to tensor<1x256x256xf32>
%16 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
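// The original contraction, still carrying the lowering_config; its result %17 is not used
// below, so it appears to be left behind once the strategy-expanded schedule that follows
// takes over.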
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%extracted_slice, %extracted_slice_0 : tensor<1x256x4096xf8E4M3FNUZ>, tensor<256x4096xf8E4M3FNUZ>) outs(%16 : tensor<1x256x256xf32>) attrs = {__tuning_spec_applied__, lowering_config = #iree_gpu.lowering_config<{lowering_strategy = "cast_and_call_expanded_f8_pingpong_matmul", workgroup = [1, 256, 256, 0]}>} {
^bb0(%in: f8E4M3FNUZ, %in_11: f8E4M3FNUZ, %out: f32):
%24 = arith.extf %in : f8E4M3FNUZ to f32
%25 = arith.extf %in_11 : f8E4M3FNUZ to f32
%26 = arith.mulf %24, %25 : f32
%27 = arith.addf %out, %26 : f32
linalg.yield %27 : f32
} -> tensor<1x256x256xf32>
%cast = tensor.cast %extracted_slice : tensor<1x256x4096xf8E4M3FNUZ> to tensor<1x256x?xf8E4M3FNUZ>
%cast_2 = tensor.cast %extracted_slice_0 : tensor<256x4096xf8E4M3FNUZ> to tensor<256x?xf8E4M3FNUZ>
%c0_3 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst_4 = arith.constant 0.000000e+00 : f8E4M3FNUZ
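// Two 32 KiB LDS buffers, one per operand, each holding a 256x128 f8 tile, with rotate_rows
// swizzle hints (presumably to avoid LDS bank conflicts); the buffer_resource_cast ops below
// request buffer loads with a cache swizzle stride equal to the K extent.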
%alloc = memref.alloc() : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>>
%alloc_5 = memref.alloc() : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>>
%dim = tensor.dim %cast_2, %c1 : tensor<256x?xf8E4M3FNUZ>
%18 = iree_gpu.buffer_resource_cast %cast cacheSwizzleStride(%dim) : tensor<1x256x?xf8E4M3FNUZ>
%19 = iree_gpu.buffer_resource_cast %cast_2 cacheSwizzleStride(%dim) : tensor<256x?xf8E4M3FNUZ>
%20 = iree_codegen.swizzle_hint %alloc[#iree_codegen.rotate_rows<128, 8>] : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>>
%21 = iree_codegen.swizzle_hint %alloc_5[#iree_codegen.rotate_rows<128, 8>] : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>>
%expand_shape = memref.expand_shape %20 [[0, 1]] output_shape [256, 128] : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>> into memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
%expand_shape_6 = memref.expand_shape %21 [[0, 1]] output_shape [256, 128] : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>> into memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
%extracted_slice_7 = tensor.extract_slice %18[0, 0, 0] [1, 256, 128] [1, 1, 1] : tensor<1x256x?xf8E4M3FNUZ> to tensor<1x256x128xf8E4M3FNUZ>
%extracted_slice_8 = tensor.extract_slice %19[0, 0] [256, 128] [1, 1] : tensor<256x?xf8E4M3FNUZ> to tensor<256x128xf8E4M3FNUZ>
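// Prologue: two scf.forall regions over 2048 linear ids stage the first 128-wide K slice of
// each operand tile from global memory into LDS.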
scf.forall (%arg3) in (2048) {
%24:2 = affine.delinearize_index %arg3 into (256, 8) : index, index
%25 = arith.muli %24#1, %c8 : index
%extracted_slice_11 = tensor.extract_slice %extracted_slice_7[0, %24#0, %25] [1, 1, 16] [1, 1, 1] : tensor<1x256x128xf8E4M3FNUZ> to tensor<1x1x16xf8E4M3FNUZ>
%26 = vector.transfer_read %extracted_slice_11[%c0_3, %c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
vector.transfer_write %26, %expand_shape[%24#0, %25] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
} {mapping = [#gpu.thread<linear_dim_0>]}
scf.forall (%arg3) in (2048) {
%24:2 = affine.delinearize_index %arg3 into (256, 8) : index, index
%25 = arith.muli %24#1, %c8 : index
%extracted_slice_11 = tensor.extract_slice %extracted_slice_8[%24#0, %25] [1, 16] [1, 1] : tensor<256x128xf8E4M3FNUZ> to tensor<1x16xf8E4M3FNUZ>
%26 = vector.transfer_read %extracted_slice_11[%c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
vector.transfer_write %26, %expand_shape_6[%24#0, %25] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
} {mapping = [#gpu.thread<linear_dim_0>]}
%expand_shape_9 = memref.expand_shape %expand_shape [[0, 1], [2, 3]] output_shape [16, 16, 4, 32] : memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>> into memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
%expand_shape_10 = memref.expand_shape %expand_shape_6 [[0, 1], [2, 3]] output_shape [16, 16, 4, 32] : memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>> into memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
%22 = tensor.empty() : tensor<1x16x16x16x16xf32>
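// Main compute loop: an scf.forall over 512 linear thread ids (8 waves of 64). LDS has been
// reshaped to 16x16x4x32 (16x16 row tiles, 4 K sub-steps of 32 f8 each, matching the
// 16x16x32 MFMA shape); results accumulate into a 1x16x16x16x16 f32 tensor.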
%23 = scf.forall (%arg3) in (512) shared_outs(%arg4 = %22) -> (tensor<1x16x16x16x16xf32>) {
%24:4 = affine.delinearize_index %arg3 into (2, 4, 4, 16) : index, index, index, index
%25 = arith.muli %24#2, %c8 : index
%26 = arith.muli %24#0, %c8 : index
%27 = arith.muli %24#1, %c4 : index
%28:2 = affine.delinearize_index %arg3 into (64, 8) : index, index
%29:3 = affine.delinearize_index %arg3 into (8, 8, 8) : index, index, index
%30 = arith.muli %29#2, %c16 : index
%31 = arith.muli %29#0, %c32 : index
%32 = arith.addi %29#1, %31 : index
%33 = arith.addi %32, %c8 : index
%34 = arith.addi %33, %c8 : index
%35 = arith.addi %34, %c8 : index
%cst_11 = arith.constant dense<0.000000e+00> : vector<8x4x1x4xf32>
%36 = arith.cmpi slt, %arg3, %c256 : index
%37 = arith.cmpi sge, %arg3, %c256 : index
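// Ping-pong stagger: the first half of the ids (< 256, i.e. the first four waves) takes an
// extra s_barrier before the K loop and the second half takes it after, which appears to
// offset the two wave groups so one computes while the other fetches.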
scf.if %36 {
rocdl.s.barrier
}
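// Software-pipelined K loop over 128-wide tiles, starting at K = 128 because the first tile
// was staged by the prologue. Each iteration prefetches the next global tile into registers,
// runs four MFMA sub-steps on the current LDS contents (bracketed by s_setprio and
// sched.barrier), and writes the prefetched data into LDS before the final sub-step. The
// per-id accumulator is a vector<8x4x1x4xf32>.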
%38 = scf.for %arg5 = %c128 to %dim step %c128 iter_args(%arg6 = %cst_11) -> (vector<8x4x1x4xf32>) {
%extracted_slice_12 = tensor.extract_slice %18[0, 0, %arg5] [1, 256, 128] [1, 1, 1] : tensor<1x256x?xf8E4M3FNUZ> to tensor<1x256x128xf8E4M3FNUZ>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_12[0, %32, %30] [1, 1, 16] [1, 1, 1] : tensor<1x256x128xf8E4M3FNUZ> to tensor<1x1x16xf8E4M3FNUZ>
%54 = vector.transfer_read %extracted_slice_13[%c0_3, %c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
%extracted_slice_14 = tensor.extract_slice %extracted_slice_12[0, %33, %30] [1, 1, 16] [1, 1, 1] : tensor<1x256x128xf8E4M3FNUZ> to tensor<1x1x16xf8E4M3FNUZ>
%55 = vector.transfer_read %extracted_slice_14[%c0_3, %c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
%extracted_slice_15 = tensor.extract_slice %extracted_slice_12[0, %34, %30] [1, 1, 16] [1, 1, 1] : tensor<1x256x128xf8E4M3FNUZ> to tensor<1x1x16xf8E4M3FNUZ>
%56 = vector.transfer_read %extracted_slice_15[%c0_3, %c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_12[0, %35, %30] [1, 1, 16] [1, 1, 1] : tensor<1x256x128xf8E4M3FNUZ> to tensor<1x1x16xf8E4M3FNUZ>
%57 = vector.transfer_read %extracted_slice_16[%c0_3, %c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
%58 = vector.transfer_read %expand_shape_9[%26, %24#3, %c0_3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
%59 = vector.transfer_read %expand_shape_10[%27, %24#3, %c0_3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
gpu.barrier
rocdl.sched.barrier 0
rocdl.s.setprio 1 {iree_gpu.swap_mfma = 1 : i64}
%60 = iree_gpu.multi_mma %58, %59, %arg6 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
gpu.barrier
rocdl.sched.barrier 0
%extracted_slice_17 = tensor.extract_slice %19[0, %arg5] [256, 128] [1, 1] : tensor<256x?xf8E4M3FNUZ> to tensor<256x128xf8E4M3FNUZ>
%extracted_slice_18 = tensor.extract_slice %extracted_slice_17[%32, %30] [1, 16] [1, 1] : tensor<256x128xf8E4M3FNUZ> to tensor<1x16xf8E4M3FNUZ>
%61 = vector.transfer_read %extracted_slice_18[%c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
%extracted_slice_19 = tensor.extract_slice %extracted_slice_17[%33, %30] [1, 16] [1, 1] : tensor<256x128xf8E4M3FNUZ> to tensor<1x16xf8E4M3FNUZ>
%62 = vector.transfer_read %extracted_slice_19[%c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
%extracted_slice_20 = tensor.extract_slice %extracted_slice_17[%34, %30] [1, 16] [1, 1] : tensor<256x128xf8E4M3FNUZ> to tensor<1x16xf8E4M3FNUZ>
%63 = vector.transfer_read %extracted_slice_20[%c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
%extracted_slice_21 = tensor.extract_slice %extracted_slice_17[%35, %30] [1, 16] [1, 1] : tensor<256x128xf8E4M3FNUZ> to tensor<1x16xf8E4M3FNUZ>
%64 = vector.transfer_read %extracted_slice_21[%c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
%65 = vector.transfer_read %expand_shape_9[%26, %24#3, %c1, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
%66 = vector.transfer_read %expand_shape_10[%27, %24#3, %c1, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
gpu.barrier
rocdl.sched.barrier 0
rocdl.s.setprio 1 {iree_gpu.swap_mfma = 1 : i64}
%67 = iree_gpu.multi_mma %65, %66, %60 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
gpu.barrier
rocdl.sched.barrier 0
%68 = vector.transfer_read %expand_shape_9[%26, %24#3, %c2, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
%69 = vector.transfer_read %expand_shape_10[%27, %24#3, %c2, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
%70 = vector.transfer_read %expand_shape_9[%26, %24#3, %c3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
%71 = vector.transfer_read %expand_shape_10[%27, %24#3, %c3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
gpu.barrier
rocdl.sched.barrier 0
rocdl.s.setprio 1 {iree_gpu.swap_mfma = 1 : i64}
%72 = iree_gpu.multi_mma %68, %69, %67 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
gpu.barrier
rocdl.sched.barrier 0
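// All LDS reads for this iteration are done; write the prefetched next K tile into LDS so it
// is ready for the next iteration, overlapping the stores with the final MFMA below.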
vector.transfer_write %54, %expand_shape[%32, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
vector.transfer_write %55, %expand_shape[%33, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
vector.transfer_write %56, %expand_shape[%34, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
vector.transfer_write %57, %expand_shape[%35, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
vector.transfer_write %61, %expand_shape_6[%32, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
vector.transfer_write %62, %expand_shape_6[%33, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
vector.transfer_write %63, %expand_shape_6[%34, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
vector.transfer_write %64, %expand_shape_6[%35, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
gpu.barrier
rocdl.sched.barrier 0
rocdl.s.setprio 1 {iree_gpu.swap_mfma = 1 : i64}
%73 = iree_gpu.multi_mma %70, %71, %72 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
gpu.barrier
rocdl.sched.barrier 0
scf.yield %73 : vector<8x4x1x4xf32>
}
scf.if %37 {
rocdl.s.barrier
}
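// Epilogue: the last K tile is still resident in LDS, so finish with four un-pipelined MFMA
// sub-steps outside the loop.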
%39 = vector.transfer_read %expand_shape_9[%26, %24#3, %c0_3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
%40 = vector.transfer_read %expand_shape_10[%27, %24#3, %c0_3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
%41 = iree_gpu.multi_mma %39, %40, %38 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
%42 = vector.transfer_read %expand_shape_9[%26, %24#3, %c1, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
%43 = vector.transfer_read %expand_shape_10[%27, %24#3, %c1, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
%44 = iree_gpu.multi_mma %42, %43, %41 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
%45 = vector.transfer_read %expand_shape_9[%26, %24#3, %c2, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
%46 = vector.transfer_read %expand_shape_10[%27, %24#3, %c2, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
%47 = iree_gpu.multi_mma %45, %46, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
%48 = vector.transfer_read %expand_shape_9[%26, %24#3, %c3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
%49 = vector.transfer_read %expand_shape_10[%27, %24#3, %c3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
%50 = iree_gpu.multi_mma %48, %49, %47 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
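// Transpose each id's accumulator fragment to 8x1x4x4 and insert it into the shared
// 1x16x16x16x16 result at this id's tile coordinates.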
%51 = vector.transpose %50, [0, 2, 1, 3] : vector<8x4x1x4xf32> to vector<8x1x4x4xf32>
%52 = tensor.empty() : tensor<1x8x1x4x4xf32>
%53 = vector.transfer_write %51, %52[%c0_3, %c0_3, %c0_3, %c0_3, %c0_3] {in_bounds = [true, true, true, true]} : vector<8x1x4x4xf32>, tensor<1x8x1x4x4xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %53 into %arg4[0, %26, %24#3, %27, %25] [1, 8, 1, 4, 4] [1, 1, 1, 1, 1] : tensor<1x8x1x4x4xf32> into tensor<1x16x16x16x16xf32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
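// Collapse the per-fragment result layout back to the 1x256x256 workgroup tile and insert it
// into the output tensor.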
%collapsed = tensor.collapse_shape %23 [[0], [1, 2], [3, 4]] : tensor<1x16x16x16x16xf32> into tensor<1x256x256xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %collapsed into %arg2[%arg0, 0, %arg1] [1, 256, 256] [1, 1, 1] : tensor<1x256x256xf32> into tensor<?x256x4096xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %15, %11, offsets = [0, 0, 0], sizes = [%9, 256, 4096], strides = [1, 1, 1] : tensor<?x256x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x256x4096xf32>>{%9}
return
}