// -----// IR Dump After LoweringConfigInterpreterPass (iree-codegen-lowering-config-interpreter) //----- //
func.func @matmul_dispatch_0_matmul_like_Dx256x4096x4096_f8E4M3FNUZxf8E4M3FNUZxf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>, llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>} {
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
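  // Reassemble the dynamic batch size from the two 32-bit push constants (low and high halves of a 64-bit value).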
  %0 = hal.interface.constant.load layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = arith.extui %0 : i32 to i64
  %3 = arith.extui %1 : i32 to i64
  %4 = arith.shli %3, %c32_i64 : i64
  %5 = arith.ori %2, %4 : i64
  %6 = arith.index_castui %5 : i64 to index
  %7 = util.assume.int %6<umin = 0, umax = 35184372088831> : index
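  // Bind the three buffers: binding(0) is the ?x256x4096 LHS, binding(1) the 4096x4096 RHS, binding(2) the ?x256x4096 f32 result.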
  %8 = hal.interface.binding.subspan layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>
  %9 = flow.dispatch.workload.ordinal %7, 0 : index
  %10 = hal.interface.binding.subspan layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x256x4096xf8E4M3FNUZ>>{%9}
  %11 = hal.interface.binding.subspan layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<?x256x4096xf32>>{%9}
  %12 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0], sizes = [%9, 256, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x256x4096xf8E4M3FNUZ>>{%9} -> tensor<?x256x4096xf8E4M3FNUZ>
  %13 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
  %14 = tensor.empty(%9) : tensor<?x256x4096xf32>
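  // Workgroup-level tiling: each workgroup owns a 1x256x256 tile of the result (workgroup tile sizes [1, 256, 256, 0]).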
  %15 = scf.forall (%arg0, %arg1) = (0, 0) to (%9, 4096) step (1, 256) shared_outs(%arg2 = %14) -> (tensor<?x256x4096xf32>) {
    %extracted_slice = tensor.extract_slice %12[%arg0, 0, 0] [1, 256, 4096] [1, 1, 1] : tensor<?x256x4096xf8E4M3FNUZ> to tensor<1x256x4096xf8E4M3FNUZ>
    %extracted_slice_0 = tensor.extract_slice %13[%arg1, 0] [256, 4096] [1, 1] : tensor<4096x4096xf8E4M3FNUZ> to tensor<256x4096xf8E4M3FNUZ>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, 0, %arg1] [1, 256, 256] [1, 1, 1] : tensor<?x256x4096xf32> to tensor<1x256x256xf32>
    %16 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
    %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%extracted_slice, %extracted_slice_0 : tensor<1x256x4096xf8E4M3FNUZ>, tensor<256x4096xf8E4M3FNUZ>) outs(%16 : tensor<1x256x256xf32>) attrs = {__tuning_spec_applied__, lowering_config = #iree_gpu.lowering_config<{lowering_strategy = "cast_and_call_expanded_f8_pingpong_matmul", workgroup = [1, 256, 256, 0]}>} {
    ^bb0(%in: f8E4M3FNUZ, %in_11: f8E4M3FNUZ, %out: f32):
      %24 = arith.extf %in : f8E4M3FNUZ to f32
      %25 = arith.extf %in_11 : f8E4M3FNUZ to f32
      %26 = arith.mulf %24, %25 : f32
      %27 = arith.addf %out, %26 : f32
      linalg.yield %27 : f32
    } -> tensor<1x256x256xf32>
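    // The linalg.generic result %17 is not used below; the tuning spec's "cast_and_call_expanded_f8_pingpong_matmul" strategy emits the equivalent computation explicitly instead.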
    %cast = tensor.cast %extracted_slice : tensor<1x256x4096xf8E4M3FNUZ> to tensor<1x256x?xf8E4M3FNUZ>
    %cast_2 = tensor.cast %extracted_slice_0 : tensor<256x4096xf8E4M3FNUZ> to tensor<256x?xf8E4M3FNUZ>
    %c0_3 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index
    %c4 = arith.constant 4 : index
    %c8 = arith.constant 8 : index
    %c16 = arith.constant 16 : index
    %c32 = arith.constant 32 : index
    %c64 = arith.constant 64 : index
    %c128 = arith.constant 128 : index
    %c256 = arith.constant 256 : index
    %cst_4 = arith.constant 0.000000e+00 : f8E4M3FNUZ
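    // Two 32 KiB shared-memory buffers hold one 256x128 f8 K-tile each for the LHS and RHS.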
    %alloc = memref.alloc() : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>>
    %alloc_5 = memref.alloc() : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>>
    %dim = tensor.dim %cast_2, %c1 : tensor<256x?xf8E4M3FNUZ>
    %18 = iree_gpu.buffer_resource_cast %cast cacheSwizzleStride(%dim) : tensor<1x256x?xf8E4M3FNUZ>
    %19 = iree_gpu.buffer_resource_cast %cast_2 cacheSwizzleStride(%dim) : tensor<256x?xf8E4M3FNUZ>
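    // Rotate shared-memory rows (rotate_rows<128, 8>) to reduce LDS bank conflicts, then view each flat buffer as 256x128.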
    %20 = iree_codegen.swizzle_hint %alloc[#iree_codegen.rotate_rows<128, 8>] : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>>
    %21 = iree_codegen.swizzle_hint %alloc_5[#iree_codegen.rotate_rows<128, 8>] : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>>
    %expand_shape = memref.expand_shape %20 [[0, 1]] output_shape [256, 128] : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>> into memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
    %expand_shape_6 = memref.expand_shape %21 [[0, 1]] output_shape [256, 128] : memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>> into memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
    %extracted_slice_7 = tensor.extract_slice %18[0, 0, 0] [1, 256, 128] [1, 1, 1] : tensor<1x256x?xf8E4M3FNUZ> to tensor<1x256x128xf8E4M3FNUZ>
    %extracted_slice_8 = tensor.extract_slice %19[0, 0] [256, 128] [1, 1] : tensor<256x?xf8E4M3FNUZ> to tensor<256x128xf8E4M3FNUZ>
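    // Prologue: stage the first 128-wide K slice of the LHS and RHS tiles from global into shared memory.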
    scf.forall (%arg3) in (2048) {
      %24:2 = affine.delinearize_index %arg3 into (256, 8) : index, index
      %25 = arith.muli %24#1, %c8 : index
      %extracted_slice_11 = tensor.extract_slice %extracted_slice_7[0, %24#0, %25] [1, 1, 16] [1, 1, 1] : tensor<1x256x128xf8E4M3FNUZ> to tensor<1x1x16xf8E4M3FNUZ>
      %26 = vector.transfer_read %extracted_slice_11[%c0_3, %c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
      vector.transfer_write %26, %expand_shape[%24#0, %25] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
    } {mapping = [#gpu.thread<linear_dim_0>]}
    scf.forall (%arg3) in (2048) {
      %24:2 = affine.delinearize_index %arg3 into (256, 8) : index, index
      %25 = arith.muli %24#1, %c8 : index
      %extracted_slice_11 = tensor.extract_slice %extracted_slice_8[%24#0, %25] [1, 16] [1, 1] : tensor<256x128xf8E4M3FNUZ> to tensor<1x16xf8E4M3FNUZ>
      %26 = vector.transfer_read %extracted_slice_11[%c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
      vector.transfer_write %26, %expand_shape_6[%24#0, %25] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
    } {mapping = [#gpu.thread<linear_dim_0>]}
    %expand_shape_9 = memref.expand_shape %expand_shape [[0, 1], [2, 3]] output_shape [16, 16, 4, 32] : memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>> into memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
    %expand_shape_10 = memref.expand_shape %expand_shape_6 [[0, 1], [2, 3]] output_shape [16, 16, 4, 32] : memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>> into memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
    %22 = tensor.empty() : tensor<1x16x16x16x16xf32>
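    // Thread-level distribution over the 512 threads. The %arg3 < 256 / >= 256 split below staggers the two halves of the workgroup with rocdl.s.barrier so their MFMA and load phases interleave (the "pingpong" schedule named in the lowering strategy).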
    %23 = scf.forall (%arg3) in (512) shared_outs(%arg4 = %22) -> (tensor<1x16x16x16x16xf32>) {
      %24:4 = affine.delinearize_index %arg3 into (2, 4, 4, 16) : index, index, index, index
      %25 = arith.muli %24#2, %c8 : index
      %26 = arith.muli %24#0, %c8 : index
      %27 = arith.muli %24#1, %c4 : index
      %28:2 = affine.delinearize_index %arg3 into (64, 8) : index, index
      %29:3 = affine.delinearize_index %arg3 into (8, 8, 8) : index, index, index
      %30 = arith.muli %29#2, %c16 : index
      %31 = arith.muli %29#0, %c32 : index
      %32 = arith.addi %29#1, %31 : index
      %33 = arith.addi %32, %c8 : index
      %34 = arith.addi %33, %c8 : index
      %35 = arith.addi %34, %c8 : index
      %cst_11 = arith.constant dense<0.000000e+00> : vector<8x4x1x4xf32>
      %36 = arith.cmpi slt, %arg3, %c256 : index
      %37 = arith.cmpi sge, %arg3, %c256 : index
      scf.if %36 {
        rocdl.s.barrier
      }
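      // Main K loop (steps of 128): issue four MFMA blocks on the tile already in shared memory while loading the next K tile from global memory and writing it into shared memory for the following iteration.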
      %38 = scf.for %arg5 = %c128 to %dim step %c128 iter_args(%arg6 = %cst_11) -> (vector<8x4x1x4xf32>) {
        %extracted_slice_12 = tensor.extract_slice %18[0, 0, %arg5] [1, 256, 128] [1, 1, 1] : tensor<1x256x?xf8E4M3FNUZ> to tensor<1x256x128xf8E4M3FNUZ>
        %extracted_slice_13 = tensor.extract_slice %extracted_slice_12[0, %32, %30] [1, 1, 16] [1, 1, 1] : tensor<1x256x128xf8E4M3FNUZ> to tensor<1x1x16xf8E4M3FNUZ>
        %54 = vector.transfer_read %extracted_slice_13[%c0_3, %c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
        %extracted_slice_14 = tensor.extract_slice %extracted_slice_12[0, %33, %30] [1, 1, 16] [1, 1, 1] : tensor<1x256x128xf8E4M3FNUZ> to tensor<1x1x16xf8E4M3FNUZ>
        %55 = vector.transfer_read %extracted_slice_14[%c0_3, %c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
        %extracted_slice_15 = tensor.extract_slice %extracted_slice_12[0, %34, %30] [1, 1, 16] [1, 1, 1] : tensor<1x256x128xf8E4M3FNUZ> to tensor<1x1x16xf8E4M3FNUZ>
        %56 = vector.transfer_read %extracted_slice_15[%c0_3, %c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
        %extracted_slice_16 = tensor.extract_slice %extracted_slice_12[0, %35, %30] [1, 1, 16] [1, 1, 1] : tensor<1x256x128xf8E4M3FNUZ> to tensor<1x1x16xf8E4M3FNUZ>
        %57 = vector.transfer_read %extracted_slice_16[%c0_3, %c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
        %58 = vector.transfer_read %expand_shape_9[%26, %24#3, %c0_3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
        %59 = vector.transfer_read %expand_shape_10[%27, %24#3, %c0_3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
        gpu.barrier
        rocdl.sched.barrier 0
        rocdl.s.setprio 1 {iree_gpu.swap_mfma = 1 : i64}
        %60 = iree_gpu.multi_mma %58, %59, %arg6 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
        rocdl.s.setprio 0
        gpu.barrier
        rocdl.sched.barrier 0
        %extracted_slice_17 = tensor.extract_slice %19[0, %arg5] [256, 128] [1, 1] : tensor<256x?xf8E4M3FNUZ> to tensor<256x128xf8E4M3FNUZ>
        %extracted_slice_18 = tensor.extract_slice %extracted_slice_17[%32, %30] [1, 16] [1, 1] : tensor<256x128xf8E4M3FNUZ> to tensor<1x16xf8E4M3FNUZ>
        %61 = vector.transfer_read %extracted_slice_18[%c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
        %extracted_slice_19 = tensor.extract_slice %extracted_slice_17[%33, %30] [1, 16] [1, 1] : tensor<256x128xf8E4M3FNUZ> to tensor<1x16xf8E4M3FNUZ>
        %62 = vector.transfer_read %extracted_slice_19[%c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
        %extracted_slice_20 = tensor.extract_slice %extracted_slice_17[%34, %30] [1, 16] [1, 1] : tensor<256x128xf8E4M3FNUZ> to tensor<1x16xf8E4M3FNUZ>
        %63 = vector.transfer_read %extracted_slice_20[%c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
        %extracted_slice_21 = tensor.extract_slice %extracted_slice_17[%35, %30] [1, 16] [1, 1] : tensor<256x128xf8E4M3FNUZ> to tensor<1x16xf8E4M3FNUZ>
        %64 = vector.transfer_read %extracted_slice_21[%c0_3, %c0_3], %cst_4 {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
        %65 = vector.transfer_read %expand_shape_9[%26, %24#3, %c1, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
        %66 = vector.transfer_read %expand_shape_10[%27, %24#3, %c1, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
        gpu.barrier
        rocdl.sched.barrier 0
        rocdl.s.setprio 1 {iree_gpu.swap_mfma = 1 : i64}
        %67 = iree_gpu.multi_mma %65, %66, %60 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
        rocdl.s.setprio 0
        gpu.barrier
        rocdl.sched.barrier 0
        %68 = vector.transfer_read %expand_shape_9[%26, %24#3, %c2, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
        %69 = vector.transfer_read %expand_shape_10[%27, %24#3, %c2, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
        %70 = vector.transfer_read %expand_shape_9[%26, %24#3, %c3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
        %71 = vector.transfer_read %expand_shape_10[%27, %24#3, %c3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
        gpu.barrier
        rocdl.sched.barrier 0
        rocdl.s.setprio 1 {iree_gpu.swap_mfma = 1 : i64}
        %72 = iree_gpu.multi_mma %68, %69, %67 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
        rocdl.s.setprio 0
        gpu.barrier
        rocdl.sched.barrier 0
        vector.transfer_write %54, %expand_shape[%32, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
        vector.transfer_write %55, %expand_shape[%33, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
        vector.transfer_write %56, %expand_shape[%34, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
        vector.transfer_write %57, %expand_shape[%35, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
        vector.transfer_write %61, %expand_shape_6[%32, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
        vector.transfer_write %62, %expand_shape_6[%33, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
        vector.transfer_write %63, %expand_shape_6[%34, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
        vector.transfer_write %64, %expand_shape_6[%35, %30] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
        gpu.barrier
        rocdl.sched.barrier 0
        rocdl.s.setprio 1 {iree_gpu.swap_mfma = 1 : i64}
        %73 = iree_gpu.multi_mma %70, %71, %72 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
        rocdl.s.setprio 0
        gpu.barrier
        rocdl.sched.barrier 0
        scf.yield %73 : vector<8x4x1x4xf32>
      }
      scf.if %37 {
        rocdl.s.barrier
      }
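      // Epilogue: consume the last staged K tile with four more MFMA blocks (no further global loads).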
      %39 = vector.transfer_read %expand_shape_9[%26, %24#3, %c0_3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
      %40 = vector.transfer_read %expand_shape_10[%27, %24#3, %c0_3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
      %41 = iree_gpu.multi_mma %39, %40, %38 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
      %42 = vector.transfer_read %expand_shape_9[%26, %24#3, %c1, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
      %43 = vector.transfer_read %expand_shape_10[%27, %24#3, %c1, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
      %44 = iree_gpu.multi_mma %42, %43, %41 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
      %45 = vector.transfer_read %expand_shape_9[%26, %24#3, %c2, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
      %46 = vector.transfer_read %expand_shape_10[%27, %24#3, %c2, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
      %47 = iree_gpu.multi_mma %45, %46, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
      %48 = vector.transfer_read %expand_shape_9[%26, %24#3, %c3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<8x1x1x8xf8E4M3FNUZ>
      %49 = vector.transfer_read %expand_shape_10[%27, %24#3, %c3, %25], %cst_4 {in_bounds = [true, true, true, true]} : memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>, vector<4x1x1x8xf8E4M3FNUZ>
      %50 = iree_gpu.multi_mma %48, %49, %47 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
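      // Transpose the per-thread accumulator and scatter it into this thread's slice of the workgroup result tile.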
      %51 = vector.transpose %50, [0, 2, 1, 3] : vector<8x4x1x4xf32> to vector<8x1x4x4xf32>
      %52 = tensor.empty() : tensor<1x8x1x4x4xf32>
      %53 = vector.transfer_write %51, %52[%c0_3, %c0_3, %c0_3, %c0_3, %c0_3] {in_bounds = [true, true, true, true]} : vector<8x1x4x4xf32>, tensor<1x8x1x4x4xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %53 into %arg4[0, %26, %24#3, %27, %25] [1, 8, 1, 4, 4] [1, 1, 1, 1, 1] : tensor<1x8x1x4x4xf32> into tensor<1x16x16x16x16xf32>
      }
    } {mapping = [#gpu.thread<linear_dim_0>]}
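    // Collapse the 1x16x16x16x16 per-thread layout back to the 1x256x256 workgroup tile and insert it into the full result.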
    %collapsed = tensor.collapse_shape %23 [[0], [1, 2], [3, 4]] : tensor<1x16x16x16x16xf32> into tensor<1x256x256xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %collapsed into %arg2[%arg0, 0, %arg1] [1, 256, 256] [1, 1, 1] : tensor<1x256x256xf32> into tensor<?x256x4096xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %15, %11, offsets = [0, 0, 0], sizes = [%9, 256, 4096], strides = [1, 1, 1] : tensor<?x256x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x256x4096xf32>>{%9}
  return
}