@pashu123
Created January 22, 2025 16:16
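// Per-pass IR dump of a 256x256x256 matmul (f16 inputs, f32 accumulator) going
// through IREE's LLVMGPUVectorDistribute pipeline (workgroup_size = [256, 1, 1],
// subgroup_size = 64, MFMA_F32_16x16x16_F16). Dumps like this are typically
// produced with the --mlir-print-ir-after-all flag; the exact iree-compile
// invocation is not included in this gist.
//
// A minimal sketch of the kind of source-level function such a dispatch is
// compiled from, reconstructed from the tensor types in the dumps below (this
// snippet is an assumption, not part of the original compiler output):
func.func @matmul_256x256x256_f16_f32(%lhs: tensor<256x256xf16>, %rhs: tensor<256x256xf16>) -> tensor<256x256xf32> {
  // Zero-initialize the f32 accumulator, then multiply the two f16 operands.
  %cst = arith.constant 0.000000e+00 : f32
  %empty = tensor.empty() : tensor<256x256xf32>
  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<256x256xf32>) -> tensor<256x256xf32>
  %result = linalg.matmul ins(%lhs, %rhs : tensor<256x256xf16>, tensor<256x256xf16>)
                          outs(%fill : tensor<256x256xf32>) -> tensor<256x256xf32>
  return %result : tensor<256x256xf32>
}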
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<256x256xf16> to tensor<64x256xf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x256xf16> to tensor<256x64xf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32>
%8 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x256xf16>, tensor<256x64xf16>) outs(%7 : tensor<64x64xf32>) -> tensor<64x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
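// The dump above shows the effect of workgroup-level tiling: the 256x256 output
// is split into 64x64 tiles, each handled by one workgroup via the scf.forall
// with #iree_codegen.workgroup_mapping<y>/<x>. The lowering_config on the
// matmul carries the rest of the schedule: MFMA_F32_16x16x16_F16 as the MMA
// intrinsic, a 2x2 subgroup layout, a reduction (K) tile of 128, and promotion
// of both operands (promote_operands = [0, 1]); later passes realize each of
// these pieces.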
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<256x256xf16> to tensor<64x256xf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x256xf16> to tensor<256x64xf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32>
%8 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x256xf16>, tensor<256x64xf16>) outs(%7 : tensor<64x64xf32>) -> tensor<64x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<256x256xf16> to tensor<64x256xf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x256xf16> to tensor<256x64xf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32>
%8 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x256xf16>, tensor<256x64xf16>) outs(%7 : tensor<64x64xf32>) -> tensor<64x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After ConvertAttentionToOnlineAttentionPass (iree-linalg-ext-convert-attention-to-online-attention) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<256x256xf16> to tensor<64x256xf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x256xf16> to tensor<256x64xf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32>
%8 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x256xf16>, tensor<256x64xf16>) outs(%7 : tensor<64x64xf32>) -> tensor<64x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<256x256xf16> to tensor<64x256xf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x256xf16> to tensor<256x64xf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32>
%8 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x256xf16>, tensor<256x64xf16>) outs(%7 : tensor<64x64xf32>) -> tensor<64x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<256x256xf16> to tensor<64x256xf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x256xf16> to tensor<256x64xf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32>
%8 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x256xf16>, tensor<256x64xf16>) outs(%7 : tensor<64x64xf32>) -> tensor<64x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
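// The five dumps above (ConfigTrackingCanonicalizer, CSE,
// ConvertAttentionToOnlineAttention, ConfigTrackingCanonicalizer, CSE) are
// identical to the first one: there is no attention op in this dispatch and
// the IR is already in canonical form, so these passes make no changes.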
// -----// IR Dump After GPUPromoteMatmulOperandsPass (iree-codegen-gpu-promote-matmul-operands) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<256x256xf16> to tensor<64x256xf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x256xf16> to tensor<256x64xf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32>
%8 = tensor.empty() : tensor<64x256xf16>
%9 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<64x256xf16>) outs(%8 : tensor<64x256xf16>) -> tensor<64x256xf16>
%10 = tensor.empty() : tensor<256x64xf16>
%11 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<256x64xf16>) outs(%10 : tensor<256x64xf16>) -> tensor<256x64xf16>
%12 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%9, %11 : tensor<64x256xf16>, tensor<256x64xf16>) outs(%7 : tensor<64x64xf32>) -> tensor<64x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
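// GPUPromoteMatmulOperandsPass acts on promote_operands = [0, 1]: both matmul
// inputs now go through a linalg.copy tagged with
// #iree_gpu.derived_thread_config, i.e. a staging copy that is distributed
// across the threads of the workgroup and is later mapped to workgroup shared
// memory during bufferization.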
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) -> tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%9 = tensor.empty() : tensor<64x128xf16>
%10 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%9 : tensor<64x128xf16>) -> tensor<64x128xf16>
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%11 = tensor.empty() : tensor<128x64xf16>
%12 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%11 : tensor<128x64xf16>) -> tensor<128x64xf16>
%13 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%10, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%arg4 : tensor<64x64xf32>) -> tensor<64x64xf32>
scf.yield %13 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
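// GPUApplyTilingLevelPass materializes the reduction tile from the
// lowering_config (reduction = [0, 0, 128]): the K dimension is now an scf.for
// loop from 0 to 256 with step 128, and each iteration copies a 64x128 LHS
// slice and a 128x64 RHS slice before feeding the 64x64 accumulator tile
// through linalg.matmul.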
// -----// IR Dump After LoopCoalescing (affine-loop-coalescing) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) -> tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%9 = tensor.empty() : tensor<64x128xf16>
%10 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%9 : tensor<64x128xf16>) -> tensor<64x128xf16>
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%11 = tensor.empty() : tensor<128x64xf16>
%12 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%11 : tensor<128x64xf16>) -> tensor<128x64xf16>
%13 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%10, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%arg4 : tensor<64x64xf32>) -> tensor<64x64xf32>
scf.yield %13 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
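// LoopCoalescing leaves the IR unchanged: there is only the single scf.for
// over K, so there are no nested loops to merge.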
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) -> tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%9 = tensor.empty() : tensor<64x128xf16>
%10 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%9 : tensor<64x128xf16>) -> tensor<64x128xf16>
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%11 = tensor.empty() : tensor<128x64xf16>
%12 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%11 : tensor<128x64xf16>) -> tensor<128x64xf16>
%13 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%10, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%arg4 : tensor<64x64xf32>) -> tensor<64x64xf32>
scf.yield %13 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) -> tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%9 = tensor.empty() : tensor<64x128xf16>
%10 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%9 : tensor<64x128xf16>) -> tensor<64x128xf16>
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%11 = tensor.empty() : tensor<128x64xf16>
%12 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%11 : tensor<128x64xf16>) -> tensor<128x64xf16>
%13 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%10, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%arg4 : tensor<64x64xf32>) -> tensor<64x64xf32>
scf.yield %13 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After DecomposeAttentionPass (iree-linalg-ext-decompose-attention) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) -> tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%9 = tensor.empty() : tensor<64x128xf16>
%10 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%9 : tensor<64x128xf16>) -> tensor<64x128xf16>
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%11 = tensor.empty() : tensor<128x64xf16>
%12 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%11 : tensor<128x64xf16>) -> tensor<128x64xf16>
%13 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%10, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%arg4 : tensor<64x64xf32>) -> tensor<64x64xf32>
scf.yield %13 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) -> tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%9 = tensor.empty() : tensor<64x128xf16>
%10 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%9 : tensor<64x128xf16>) -> tensor<64x128xf16>
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%11 = tensor.empty() : tensor<128x64xf16>
%12 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%11 : tensor<128x64xf16>) -> tensor<128x64xf16>
%13 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%10, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%arg4 : tensor<64x64xf32>) -> tensor<64x64xf32>
scf.yield %13 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) -> tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%9 = tensor.empty() : tensor<64x128xf16>
%10 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%9 : tensor<64x128xf16>) -> tensor<64x128xf16>
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%11 = tensor.empty() : tensor<128x64xf16>
%12 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%11 : tensor<128x64xf16>) -> tensor<128x64xf16>
%13 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%10, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%arg4 : tensor<64x64xf32>) -> tensor<64x64xf32>
scf.yield %13 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
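// As before, the canonicalize/CSE/DecomposeAttention dumps above are identical
// to the GPUApplyTilingLevel output: with no attention op present and the IR
// already clean, these passes are no-ops for this dispatch.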
// -----// IR Dump After LLVMGPUConfigureTensorLayoutsPass (iree-llvmgpu-configure-tensor-layouts) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) -> tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%9 = tensor.empty() : tensor<64x128xf16>
%10 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%9 : tensor<64x128xf16>) -> tensor<64x128xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16>
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%12 = tensor.empty() : tensor<128x64xf16>
%13 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%12 : tensor<128x64xf16>) -> tensor<128x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16>
%15 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16>
%16 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16>
%17 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%18 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%15, %16 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%17 : tensor<64x64xf32>) -> tensor<64x64xf32>
%19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
scf.yield %19 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
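// LLVMGPUConfigureTensorLayoutsPass annotates the matmul operands and the
// accumulator with iree_vector_ext.to_layout ops that spell out the
// MFMA_F32_16x16x16_F16 distribution. Each nested layout factors a tensor
// dimension as subgroup_tile * batch_tile * outer_tile * thread_tile *
// element_tile; for the 64x64 accumulator the layout
// [2,2]x[2,2]x[1,1]x[4,16]x[4,1] gives 2*2*1*4*4 = 64 rows and
// 2*2*1*16*1 = 64 columns, matching the 2x2 subgroup split of the workgroup
// tile. The copies keep simpler per-thread layouts for the global-to-shared
// transfer (e.g. 64x128 = (1*4*1*16*1) x (1*1*1*16*8)), and the extra
// to_layout ops marked shared_memory_conversion express the layout change
// that happens through shared memory.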
// -----// IR Dump After IREELoopInvariantCodeMotionPass (iree-loop-invariant-code-motion) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) -> tensor<64x64xf32>
%8 = tensor.empty() : tensor<64x128xf16>
%9 = tensor.empty() : tensor<128x64xf16>
%10 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%11 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%8 : tensor<64x128xf16>) -> tensor<64x128xf16>
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16>
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%13 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%9 : tensor<128x64xf16>) -> tensor<128x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16>
%15 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16>
%16 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16>
%17 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%18 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%15, %16 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%17 : tensor<64x64xf32>) -> tensor<64x64xf32>
%19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
scf.yield %19 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %10 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After LinalgGeneralizeNamedOpsPass (linalg-generalize-named-ops) //----- //
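// Note: this pass rewrites the named ops of the previous dump (linalg.fill, the two promotion linalg.copy ops, and linalg.matmul) into equivalent linalg.generic ops with explicit indexing maps and region bodies; the loop structure and the to_layout chain are unchanged.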
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64x64xf32>
%8 = tensor.empty() : tensor<64x128xf16>
%9 = tensor.empty() : tensor<128x64xf16>
%10 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%8 : tensor<64x128xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<64x128xf16>
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16>
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%9 : tensor<128x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<128x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16>
%15 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16>
%16 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16>
%17 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%15, %16 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%17 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_2: f16, %out: f32):
%20 = arith.extf %in : f16 to f32
%21 = arith.extf %in_2 : f16 to f32
%22 = arith.mulf %20, %21 : f32
%23 = arith.addf %out, %22 : f32
linalg.yield %23 : f32
} -> tensor<64x64xf32>
%19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
scf.yield %19 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %10 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After VectorExtFoldUnitExtentDimsPass (iree-vector-ext-fold-unit-extent-dims) //----- //
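// Note: no change from the previous dump; there appear to be no unit-extent dimensions for this pass to fold in this dispatch.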
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64x64xf32>
%8 = tensor.empty() : tensor<64x128xf16>
%9 = tensor.empty() : tensor<128x64xf16>
%10 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%8 : tensor<64x128xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<64x128xf16>
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16>
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%9 : tensor<128x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<128x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16>
%15 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16>
%16 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16>
%17 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%15, %16 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%17 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_2: f16, %out: f32):
%20 = arith.extf %in : f16 to f32
%21 = arith.extf %in_2 : f16 to f32
%22 = arith.mulf %20, %21 : f32
%23 = arith.addf %out, %22 : f32
linalg.yield %23 : f32
} -> tensor<64x64xf32>
%19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
scf.yield %19 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %10 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After LinalgFoldUnitExtentDimsPass (linalg-fold-unit-extent-dims) //----- //
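// Note: no change from the previous dump.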
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64x64xf32>
%8 = tensor.empty() : tensor<64x128xf16>
%9 = tensor.empty() : tensor<128x64xf16>
%10 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%8 : tensor<64x128xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<64x128xf16>
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16>
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%9 : tensor<128x64xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<128x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16>
%15 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16>
%16 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16>
%17 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%15, %16 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%17 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_2: f16, %out: f32):
%20 = arith.extf %in : f16 to f32
%21 = arith.extf %in_2 : f16 to f32
%22 = arith.mulf %20, %21 : f32
%23 = arith.addf %out, %22 : f32
linalg.yield %23 : f32
} -> tensor<64x64xf32>
%19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
scf.yield %19 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %10 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
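// Note: canonicalization folds the identity-copy linalg.generic ops and their tensor.empty destinations away, so the iree_vector_ext.to_layout ops now consume the extracted slices directly.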
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%9 = iree_vector_ext.to_layout %extracted_slice_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16>
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%10 = iree_vector_ext.to_layout %extracted_slice_1 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16>
%11 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16>
%12 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16>
%13 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%13 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_2: f16, %out: f32):
%16 = arith.extf %in : f16 to f32
%17 = arith.extf %in_2 : f16 to f32
%18 = arith.mulf %16, %17 : f32
%19 = arith.addf %out, %18 : f32
linalg.yield %19 : f32
} -> tensor<64x64xf32>
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
scf.yield %15 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
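// Note: CSE leaves the IR unchanged; there are no redundant subexpressions to eliminate here.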
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%9 = iree_vector_ext.to_layout %extracted_slice_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16>
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%10 = iree_vector_ext.to_layout %extracted_slice_1 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16>
%11 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16>
%12 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16>
%13 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%13 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_2: f16, %out: f32):
%16 = arith.extf %in : f16 to f32
%17 = arith.extf %in_2 : f16 to f32
%18 = arith.mulf %16, %17 : f32
%19 = arith.addf %out, %18 : f32
linalg.yield %19 : f32
} -> tensor<64x64xf32>
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
scf.yield %15 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
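// Note: the only visible effect is that both tensor.extract_slice ops are now grouped at the top of the K-loop body; the slices, layouts, and contraction are otherwise unchanged.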
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%9 = iree_vector_ext.to_layout %extracted_slice_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16>
%10 = iree_vector_ext.to_layout %extracted_slice_1 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16>
%11 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16>
%12 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16>
%13 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%13 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_2: f16, %out: f32):
%16 = arith.extf %in : f16 to f32
%17 = arith.extf %in_2 : f16 to f32
%18 = arith.mulf %16, %17 : f32
%19 = arith.addf %out, %18 : f32
linalg.yield %19 : f32
} -> tensor<64x64xf32>
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
scf.yield %15 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After DecomposeConvolutionToLowerDimOpsPass (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- //
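// Note: there are no convolutions in this dispatch, so the IR is unchanged.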
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%9 = iree_vector_ext.to_layout %extracted_slice_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16>
%10 = iree_vector_ext.to_layout %extracted_slice_1 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16>
%11 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16>
%12 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16>
%13 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%13 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_2: f16, %out: f32):
%16 = arith.extf %in : f16 to f32
%17 = arith.extf %in_2 : f16 to f32
%18 = arith.mulf %16, %17 : f32
%19 = arith.addf %out, %18 : f32
linalg.yield %19 : f32
} -> tensor<64x64xf32>
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
scf.yield %15 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After DecomposeIm2colPass (iree-linalg-ext-decompose-im2col) //----- //
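// Note: there are no im2col ops in this dispatch, so the IR is unchanged.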
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%9 = iree_vector_ext.to_layout %extracted_slice_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16>
%10 = iree_vector_ext.to_layout %extracted_slice_1 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16>
%11 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16>
%12 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16>
%13 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%13 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_2: f16, %out: f32):
%16 = arith.extf %in : f16 to f32
%17 = arith.extf %in_2 : f16 to f32
%18 = arith.mulf %16, %17 : f32
%19 = arith.addf %out, %18 : f32
linalg.yield %19 : f32
} -> tensor<64x64xf32>
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
scf.yield %15 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After VectorizeIREEVectorExtOpsPass (iree-vector-ext-vectorize-ops) //----- //
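// Note: the iree_vector_ext.to_layout ops are moved onto vector types: each operand is loaded with vector.transfer_read, re-laid-out as a vector, and written back into a fresh tensor.empty so that the still tensor-level matmul linalg.generic can consume it; an f16 zero constant is introduced as the read padding value.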
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_0 : f32) outs(%extracted_slice : tensor<64x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_1 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%extracted_slice_2 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%9 = vector.transfer_read %extracted_slice_1[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x128xf16>, vector<64x128xf16>
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%11 = vector.transfer_read %extracted_slice_2[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<128x64xf16>, vector<128x64xf16>
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%13 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x128xf16>
%14 = tensor.empty() : tensor<64x128xf16>
%15 = vector.transfer_write %13, %14[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, tensor<64x128xf16>
%16 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<128x64xf16>
%17 = tensor.empty() : tensor<128x64xf16>
%18 = vector.transfer_write %16, %17[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, tensor<128x64xf16>
%19 = vector.transfer_read %arg4[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32>
%20 = iree_vector_ext.to_layout %19 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%21 = tensor.empty() : tensor<64x64xf32>
%22 = vector.transfer_write %20, %21[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%15, %18 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%22 : tensor<64x64xf32>) {
^bb0(%in: f16, %in_3: f16, %out: f32):
%28 = arith.extf %in : f16 to f32
%29 = arith.extf %in_3 : f16 to f32
%30 = arith.mulf %28, %29 : f32
%31 = arith.addf %out, %30 : f32
linalg.yield %31 : f32
} -> tensor<64x64xf32>
%24 = vector.transfer_read %23[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32>
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%26 = tensor.empty() : tensor<64x64xf32>
%27 = vector.transfer_write %25, %26[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
scf.yield %27 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- //
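// Note: the remaining linalg.generic ops are vectorized: the fill becomes a vector.transfer_write of a dense<0.000000e+00> vector and the matmul becomes a vector.contract, so the to_layout chain feeds the contraction directly and only the loop-carried accumulator is written back to a tensor each iteration.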
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = vector.transfer_write %cst, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_2 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%extracted_slice_3 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%9 = vector.transfer_read %extracted_slice_2[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x128xf16>, vector<64x128xf16>
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%11 = vector.transfer_read %extracted_slice_3[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<128x64xf16>, vector<128x64xf16>
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%13 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x128xf16>
%14 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<128x64xf16>
%15 = vector.transfer_read %arg4[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32>
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %16 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%19 = tensor.empty() : tensor<64x64xf32>
%20 = vector.transfer_write %18, %19[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
scf.yield %20 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
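// Note: this canonicalization run makes no changes; the dump below is identical to the previous one.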
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = vector.transfer_write %cst, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_2 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%extracted_slice_3 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%9 = vector.transfer_read %extracted_slice_2[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x128xf16>, vector<64x128xf16>
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%11 = vector.transfer_read %extracted_slice_3[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<128x64xf16>, vector<128x64xf16>
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%13 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x128xf16>
%14 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<128x64xf16>
%15 = vector.transfer_read %arg4[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32>
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %16 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%19 = tensor.empty() : tensor<64x64xf32>
%20 = vector.transfer_write %18, %19[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
scf.yield %20 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
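// Note: CSE also finds nothing to eliminate here; the dump below is unchanged.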
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = vector.transfer_write %cst, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
%extracted_slice_2 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
%extracted_slice_3 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
%9 = vector.transfer_read %extracted_slice_2[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x128xf16>, vector<64x128xf16>
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%11 = vector.transfer_read %extracted_slice_3[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<128x64xf16>, vector<128x64xf16>
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%13 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x128xf16>
%14 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<128x64xf16>
%15 = vector.transfer_read %arg4[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32>
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %16 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%19 = tensor.empty() : tensor<64x64xf32>
%20 = vector.transfer_write %18, %19[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
scf.yield %20 : tensor<64x64xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
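// Note: in the dump below the K loop now carries the accumulator as a vector<64x64xf32>
// iter_arg seeded with the zero splat %cst instead of a tensor. The per-iteration
// transfer_read/transfer_write of the accumulator is gone, the operand extract_slices are
// folded into transfer_reads indexed directly by (%arg0, %arg3) and (%arg3, %arg1), and the
// result is written back to a tensor only once after the loop.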
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%7 = tensor.empty() : tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) {
%10 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%12 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16>
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%14 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x128xf16>
%15 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<128x64xf16>
%16 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %18 : vector<64x64xf32>
}
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
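// Note: no changes from this canonicalization run; the dump below matches the previous one.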
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%7 = tensor.empty() : tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) {
%10 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%12 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16>
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%14 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x128xf16>
%15 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<128x64xf16>
%16 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %18 : vector<64x64xf32>
}
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
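// Note: CSE leaves the IR unchanged here as well.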
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%7 = tensor.empty() : tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) {
%10 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%12 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16>
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%14 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x128xf16>
%15 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<128x64xf16>
%16 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %18 : vector<64x64xf32>
}
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After GPUVectorAllocPass (iree-codegen-gpu-vector-alloc) //----- //
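// Note: in the dump below each matmul operand tile gets a workgroup-memory staging buffer:
// bufferization.alloc_tensor ops with #gpu.address_space<workgroup> are created, the loaded
// vectors are written into them, synchronized with iree_gpu.value_barrier, and read back
// before the MFMA-layout to_layout ops. The two gpu.barrier ops at the top of the loop body
// are presumably placed for the prefetch_shared_memory pipelining enabled in translation_info.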
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%7 = tensor.empty() : tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) {
gpu.barrier
gpu.barrier
%10 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%12 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16>
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x128xf16, #gpu.address_space<workgroup>>
%c0_1 = arith.constant 0 : index
%15 = vector.transfer_write %11, %14[%c0_1, %c0_1] {in_bounds = [true, true]} : vector<64x128xf16>, tensor<64x128xf16, #gpu.address_space<workgroup>>
%16 = iree_gpu.value_barrier %15 : tensor<64x128xf16, #gpu.address_space<workgroup>>
%c0_2 = arith.constant 0 : index
%cst_3 = arith.constant 0.000000e+00 : f16
%17 = vector.transfer_read %16[%c0_2, %c0_2], %cst_3 {in_bounds = [true, true]} : tensor<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x64xf16, #gpu.address_space<workgroup>>
%c0_4 = arith.constant 0 : index
%20 = vector.transfer_write %13, %19[%c0_4, %c0_4] {in_bounds = [true, true]} : vector<128x64xf16>, tensor<128x64xf16, #gpu.address_space<workgroup>>
%21 = iree_gpu.value_barrier %20 : tensor<128x64xf16, #gpu.address_space<workgroup>>
%c0_5 = arith.constant 0 : index
%cst_6 = arith.constant 0.000000e+00 : f16
%22 = vector.transfer_read %21[%c0_5, %c0_5], %cst_6 {in_bounds = [true, true]} : tensor<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%24 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%25 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %18, %23, %24 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %26 : vector<64x64xf32>
}
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
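// Note: canonicalization folds the duplicated zero/index constants introduced by the previous
// pass and collapses the two adjacent gpu.barrier ops into one.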
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%7 = tensor.empty() : tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) {
gpu.barrier
%10 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%12 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16>
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x128xf16, #gpu.address_space<workgroup>>
%15 = vector.transfer_write %11, %14[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, tensor<64x128xf16, #gpu.address_space<workgroup>>
%16 = iree_gpu.value_barrier %15 : tensor<64x128xf16, #gpu.address_space<workgroup>>
%17 = vector.transfer_read %16[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x64xf16, #gpu.address_space<workgroup>>
%20 = vector.transfer_write %13, %19[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, tensor<128x64xf16, #gpu.address_space<workgroup>>
%21 = iree_gpu.value_barrier %20 : tensor<128x64xf16, #gpu.address_space<workgroup>>
%22 = vector.transfer_read %21[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%24 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%25 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %18, %23, %24 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %26 : vector<64x64xf32>
}
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
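// Note: CSE makes no further changes; the dump below matches the canonicalized form.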
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%7 = tensor.empty() : tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) {
gpu.barrier
%10 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%12 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16>
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x128xf16, #gpu.address_space<workgroup>>
%15 = vector.transfer_write %11, %14[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, tensor<64x128xf16, #gpu.address_space<workgroup>>
%16 = iree_gpu.value_barrier %15 : tensor<64x128xf16, #gpu.address_space<workgroup>>
%17 = vector.transfer_read %16[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x64xf16, #gpu.address_space<workgroup>>
%20 = vector.transfer_write %13, %19[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, tensor<128x64xf16, #gpu.address_space<workgroup>>
%21 = iree_gpu.value_barrier %20 : tensor<128x64xf16, #gpu.address_space<workgroup>>
%22 = vector.transfer_read %21[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%24 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%25 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %18, %23, %24 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %26 : vector<64x64xf32>
}
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After GPUCombineValueBarriersPass (iree-codegen-gpu-combine-value-barriers) //----- //
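// Note: the two iree_gpu.value_barrier ops guarding the A and B shared-memory tiles are merged
// into a single two-operand barrier (%18:2), so both staging writes are synchronized together
// before the reads that feed the vector.contract.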
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%7 = tensor.empty() : tensor<64x64xf32>
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) {
gpu.barrier
%10 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%12 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16>
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x128xf16, #gpu.address_space<workgroup>>
%15 = vector.transfer_write %11, %14[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, tensor<64x128xf16, #gpu.address_space<workgroup>>
%16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x64xf16, #gpu.address_space<workgroup>>
%17 = vector.transfer_write %13, %16[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, tensor<128x64xf16, #gpu.address_space<workgroup>>
%18:2 = iree_gpu.value_barrier %15, %17 : tensor<64x128xf16, #gpu.address_space<workgroup>>, tensor<128x64xf16, #gpu.address_space<workgroup>>
%19 = vector.transfer_read %18#0[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%20 = iree_vector_ext.to_layout %19 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%21 = vector.transfer_read %18#1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%22 = iree_vector_ext.to_layout %21 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%23 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %20, %22, %23 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %25 : vector<64x64xf32>
}
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- //
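// Note: in the dump below the scf.forall's shared_outs now comes from a flow.dispatch.tensor.load
// of the writeonly output binding (%5) instead of a tensor.empty, and the final transfer_write
// targets the extract_slice of %arg2, so the result is written in place into the destination
// tile. The remaining tensor.empty ops (%6 and the inner 64x64 one) are left unused.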
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> -> tensor<256x256xf32>
%6 = tensor.empty() : tensor<256x256xf32>
%7 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%8 = tensor.empty() : tensor<64x64xf32>
%9 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) {
gpu.barrier
%11 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16>
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%13 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x128xf16, #gpu.address_space<workgroup>>
%16 = vector.transfer_write %12, %15[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, tensor<64x128xf16, #gpu.address_space<workgroup>>
%17 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x64xf16, #gpu.address_space<workgroup>>
%18 = vector.transfer_write %14, %17[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, tensor<128x64xf16, #gpu.address_space<workgroup>>
%19:2 = iree_gpu.value_barrier %16, %18 : tensor<64x128xf16, #gpu.address_space<workgroup>>, tensor<128x64xf16, #gpu.address_space<workgroup>>
%20 = vector.transfer_read %19#0[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%21 = iree_vector_ext.to_layout %20 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%22 = vector.transfer_read %19#1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%24 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%25 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %23, %24 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %26 : vector<64x64xf32>
}
%10 = vector.transfer_write %9, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %10 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
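// NOTE (annotation, not compiler output): relative to the preceding dump, eliminate-empty-tensors appears to
// have replaced the empty tensor that initialized shared_outs with a load of the writeonly output binding
// (%5 above), so each workgroup tile now accumulates directly into the destination. The leftover
// tensor.empty results (%6 and the per-tile %8) are unused here and are expected to disappear in the
// following dumps.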
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> -> tensor<256x256xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
%7 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) {
gpu.barrier
%9 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16>
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%11 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16>
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x128xf16, #gpu.address_space<workgroup>>
%14 = vector.transfer_write %10, %13[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, tensor<64x128xf16, #gpu.address_space<workgroup>>
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x64xf16, #gpu.address_space<workgroup>>
%16 = vector.transfer_write %12, %15[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, tensor<128x64xf16, #gpu.address_space<workgroup>>
%17:2 = iree_gpu.value_barrier %14, %16 : tensor<64x128xf16, #gpu.address_space<workgroup>>, tensor<128x64xf16, #gpu.address_space<workgroup>>
%18 = vector.transfer_read %17#0[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%20 = vector.transfer_read %17#1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%21 = iree_vector_ext.to_layout %20 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%22 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%23 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %19, %21, %22 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %24 : vector<64x64xf32>
}
%8 = vector.transfer_write %7, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
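// NOTE (annotation, not compiler output): empty-tensor-to-alloc-tensor normally rewrites tensor.empty into
// bufferization.alloc_tensor; in this function the remaining tensor.empty ops were already dead, so this
// dump simply no longer contains them. The two workgroup-memory bufferization.alloc_tensor ops were already
// present before this pass ran.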
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) {
gpu.barrier
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16>
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16>
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %7, %alloc_2[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%10 = vector.transfer_read %alloc_2[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %14 : vector<64x64xf32>
}
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview, %subview_1 : memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.copy %2, %2 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
return
}
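// NOTE (annotation, not compiler output): comprehensive bufferization appears to have moved the function
// from the tensor domain to the memref domain:
//   - the binding subspans now yield memref<256x256xf16/f32, #hal.descriptor_type<storage_buffer>> values,
//     each followed by memref.assume_alignment, and the flow.dispatch.tensor.load/store ops are gone;
//   - the workgroup-memory bufferization.alloc_tensor ops became memref.alloc in
//     #gpu.address_space<workgroup>, and iree_gpu.value_barrier became a plain gpu.barrier;
//   - scf.forall no longer carries shared_outs; each workgroup writes its accumulator into a memref.subview
//     of the output. The copy of that subview onto an identical subview and the trailing whole-buffer
//     memref.copy %2, %2 look redundant and are cleaned up by the canonicalize/CSE runs that follow.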
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) {
gpu.barrier
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16>
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16>
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %7, %alloc_2[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%10 = vector.transfer_read %alloc_2[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %14 : vector<64x64xf32>
}
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview, %subview_1 : memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.copy %2, %2 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
return
}
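// NOTE (annotation, not compiler output): resolve-shaped-type-result-dims appears to leave this function
// unchanged; all shapes here are static, so there are no dynamic result dimensions to resolve.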
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) {
gpu.barrier
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16>
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16>
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %7, %alloc_2[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%10 = vector.transfer_read %alloc_2[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %14 : vector<64x64xf32>
}
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview, %subview_1 : memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
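// NOTE (annotation, not compiler output): this canonicalize run appears to have folded away the redundant
// whole-buffer memref.copy %2, %2 at the end of the function; the per-tile subview copy is still present.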
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) {
gpu.barrier
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16>
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16>
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %7, %alloc_1[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%10 = vector.transfer_read %alloc_1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %14 : vector<64x64xf32>
}
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview, %subview : memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
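// NOTE (annotation, not compiler output): CSE merges the two identical memref.subview ops of the output
// tile, so the trailing copy degenerates into a self-copy, roughly:
//   memref.copy %subview, %subview_1   ->   memref.copy %subview, %subview
// which the next canonicalize run can then erase.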
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) {
gpu.barrier
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16>
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16>
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %7, %alloc_1[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%10 = vector.transfer_read %alloc_1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %14 : vector<64x64xf32>
}
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
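// NOTE (annotation, not compiler output): the self-copy is now gone; each workgroup simply performs the
// final vector.transfer_write of its 64x64 accumulator into its subview of the output buffer.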
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) {
gpu.barrier
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16>
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16>
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %7, %alloc_1[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%10 = vector.transfer_read %alloc_1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %14 : vector<64x64xf32>
}
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
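// NOTE (annotation, not compiler output): cleanup-buffer-alloc-view and the canonicalize/CSE reruns that
// follow appear to make no further changes to this function; the dumps below repeat the same IR.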
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) {
gpu.barrier
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16>
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16>
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %7, %alloc_1[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%10 = vector.transfer_read %alloc_1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %14 : vector<64x64xf32>
}
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) {
gpu.barrier
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16>
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16>
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %7, %alloc_1[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%10 = vector.transfer_read %alloc_1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %14 : vector<64x64xf32>
}
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) {
gpu.barrier
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16>
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16>
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %7, %alloc_1[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%10 = vector.transfer_read %alloc_1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %14 : vector<64x64xf32>
}
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
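// NOTE (annotation): this dump appears textually identical to the canonicalized dump
// above; CSE did not find any remaining redundant subexpressions to merge at this point.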
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) {
gpu.barrier
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16>
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16>
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>>
vector.transfer_write %7, %alloc_1[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%10 = vector.transfer_read %alloc_1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %14 : vector<64x64xf32>
}
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After HoistStaticallyBoundAllocationsPass (iree-codegen-hoist-statically-bound-allocations) //----- //
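// NOTE (annotation): the two statically sized LDS buffers (memref<64x128xf16> and
// memref<128x64xf16> in #gpu.address_space<workgroup>) are hoisted out of the loop nest
// to function entry, and matching memref.dealloc ops are inserted before the return, so
// shared memory is reserved once for the whole kernel instead of once per K iteration.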
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%alloc = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%cst_1 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) {
gpu.barrier
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_1 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16>
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_1 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16>
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
vector.transfer_write %5, %alloc_0[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %7, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%8 = vector.transfer_read %alloc_0[%c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%10 = vector.transfer_read %alloc[%c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %14 : vector<64x64xf32>
}
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.dealloc %alloc_0 : memref<64x128xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<128x64xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After LLVMGPUCastTypeToFitMMAPass (iree-llvmgpu-cast-type-to-fit-mma) //----- //
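// NOTE (annotation): apart from constant reordering and SSA renumbering, the change here
// is that the vector.contract now carries iree.amdgpu.mma = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>.
// Because the accumulator is already f32, which is what this MFMA intrinsic produces, no
// extension or truncation casts need to be inserted around the contraction.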
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%c0 = arith.constant 0 : index
%c256 = arith.constant 256 : index
%c128 = arith.constant 128 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
%alloc = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_0) -> (vector<64x64xf32>) {
gpu.barrier
%4 = vector.transfer_read %0[%arg0, %arg2], %cst {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16>
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16>
%6 = vector.transfer_read %1[%arg2, %arg1], %cst {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16>
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16>
vector.transfer_write %5, %alloc_1[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
vector.transfer_write %7, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%8 = vector.transfer_read %alloc_1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16>
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>
%10 = vector.transfer_read %alloc[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16>
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16>
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 {iree.amdgpu.mma = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32>
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
scf.yield %14 : vector<64x64xf32>
}
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.dealloc %alloc_1 : memref<64x128xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<128x64xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After LLVMGPUVectorDistributePass (iree-llvmgpu-vector-distribute) //----- //
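// NOTE (annotation): vector distribution rewrites the workgroup-level vectors into
// per-thread fragments. %0 = affine.linearize_index ... by (1, 1, 256) below is the flat
// thread id within the 256-thread workgroup, and all addressing is expressed in terms of it.
// Rough accounting for the cooperative global->LDS copy, per K step:
//   A tile: 64x128 f16 = 8192 elements; 256 threads x vector<1x8xf16> = 2048 per round,
//           so 4 rounds per thread at row offsets +0/+16/+32/+48 (16 threads cover a row).
//   B tile: 128x64 f16 = 8192 elements; likewise 4 rounds at row offsets +0/+32/+64/+96
//           (8 threads cover a row).
// After the second barrier each thread re-reads its MFMA operands from LDS as
// vector<1x4xf16> (A) and vector<4x1xf16> (B) slices and accumulates into
// vector<2x2x1x1x4x1xf32>: a 2x2 grid of batch tiles, each holding this thread's 4x1 f32
// slice of a 16x16 MFMA accumulator. The contraction itself is unrolled into
// amdgpu.mfma 16x16x16 intrinsics further down.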
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16>
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32>
%cst_2 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%thread_id_z = gpu.thread_id z
%thread_id_y = gpu.thread_id y
%thread_id_x = gpu.thread_id x
%0 = affine.linearize_index disjoint [%thread_id_z, %thread_id_y, %thread_id_x] by (1, 1, 256) : index
%alloc = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%subview = memref.subview %3[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%4 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) {
gpu.barrier
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %0]
%18 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %0]
%19 = vector.transfer_read %1[%17, %18], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%20 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %0]
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %0]
%22 = vector.transfer_read %1[%20, %21], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %0]
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %0]
%25 = vector.transfer_read %1[%23, %24], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %0]
%27 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %0]
%28 = vector.transfer_read %1[%26, %27], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%29 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %0]
%30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %0]
%31 = vector.transfer_read %2[%29, %30], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%32 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %0]
%33 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %0]
%34 = vector.transfer_read %2[%32, %33], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%35 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %0]
%36 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %0]
%37 = vector.transfer_read %2[%35, %36], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%38 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %0]
%39 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %0]
%40 = vector.transfer_read %2[%38, %39], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%0]
%42 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%0]
vector.transfer_write %19, %alloc_3[%41, %42] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%43 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%0]
%44 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%0]
vector.transfer_write %22, %alloc_3[%43, %44] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%45 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%0]
%46 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%0]
vector.transfer_write %25, %alloc_3[%45, %46] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%47 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%0]
%48 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%0]
vector.transfer_write %28, %alloc_3[%47, %48] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%49 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%0]
%50 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
vector.transfer_write %31, %alloc[%49, %50] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
%51 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%0]
%52 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
vector.transfer_write %34, %alloc[%51, %52] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
%53 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%0]
%54 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
vector.transfer_write %37, %alloc[%53, %54] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
%55 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%0]
%56 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0]
vector.transfer_write %40, %alloc[%55, %56] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
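// NOTE (annotation): the barrier above guarantees every thread has finished writing its
// slices of the A/B tiles to LDS before any thread reads them back; the reads below use
// the MFMA operand layout (vector<1x4xf16> rows of A, vector<4x1xf16> columns of B)
// rather than the vector<1x8xf16> copy layout. With prefetch_shared_memory = true in the
// translation_info, a later pipelining stage is expected to overlap these two phases.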
%57 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%58 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%59 = vector.transfer_read %alloc_3[%57, %58], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%60 = vector.insert_strided_slice %59, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%61 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%62 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%63 = vector.transfer_read %alloc_3[%61, %62], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%64 = vector.insert_strided_slice %63, %60 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%65 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%66 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0]
%67 = vector.transfer_read %alloc_3[%65, %66], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%68 = vector.insert_strided_slice %67, %64 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%69 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%70 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0]
%71 = vector.transfer_read %alloc_3[%69, %70], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%72 = vector.insert_strided_slice %71, %68 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%73 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%74 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%0]
%75 = vector.transfer_read %alloc_3[%73, %74], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%76 = vector.insert_strided_slice %75, %72 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%77 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%78 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%0]
%79 = vector.transfer_read %alloc_3[%77, %78], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%80 = vector.insert_strided_slice %79, %76 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%81 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%82 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%0]
%83 = vector.transfer_read %alloc_3[%81, %82], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%84 = vector.insert_strided_slice %83, %80 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%85 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%86 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%0]
%87 = vector.transfer_read %alloc_3[%85, %86], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%88 = vector.insert_strided_slice %87, %84 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%89 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%90 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%91 = vector.transfer_read %alloc_3[%89, %90], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%92 = vector.insert_strided_slice %91, %88 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%93 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%94 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%95 = vector.transfer_read %alloc_3[%93, %94], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%96 = vector.insert_strided_slice %95, %92 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%97 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%98 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0]
%99 = vector.transfer_read %alloc_3[%97, %98], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%100 = vector.insert_strided_slice %99, %96 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%101 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%102 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0]
%103 = vector.transfer_read %alloc_3[%101, %102], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%104 = vector.insert_strided_slice %103, %100 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%105 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%106 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%0]
%107 = vector.transfer_read %alloc_3[%105, %106], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%108 = vector.insert_strided_slice %107, %104 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%109 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%110 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%0]
%111 = vector.transfer_read %alloc_3[%109, %110], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%112 = vector.insert_strided_slice %111, %108 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%113 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%114 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%0]
%115 = vector.transfer_read %alloc_3[%113, %114], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%116 = vector.insert_strided_slice %115, %112 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%117 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%118 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%0]
%119 = vector.transfer_read %alloc_3[%117, %118], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%120 = vector.insert_strided_slice %119, %116 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%121 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%122 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%123 = vector.transfer_read %alloc[%121, %122], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%124 = vector.insert_strided_slice %123, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%125 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%126 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%127 = vector.transfer_read %alloc[%125, %126], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%128 = vector.insert_strided_slice %127, %124 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%129 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%130 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%131 = vector.transfer_read %alloc[%129, %130], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%132 = vector.insert_strided_slice %131, %128 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%133 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%134 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%135 = vector.transfer_read %alloc[%133, %134], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%136 = vector.insert_strided_slice %135, %132 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%137 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0]
%138 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%139 = vector.transfer_read %alloc[%137, %138], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%140 = vector.insert_strided_slice %139, %136 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%141 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0]
%142 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%143 = vector.transfer_read %alloc[%141, %142], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%144 = vector.insert_strided_slice %143, %140 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%145 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0]
%146 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%147 = vector.transfer_read %alloc[%145, %146], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%148 = vector.insert_strided_slice %147, %144 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%149 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0]
%150 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%151 = vector.transfer_read %alloc[%149, %150], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%152 = vector.insert_strided_slice %151, %148 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%153 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%0]
%154 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%155 = vector.transfer_read %alloc[%153, %154], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%156 = vector.insert_strided_slice %155, %152 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%157 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%0]
%158 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%159 = vector.transfer_read %alloc[%157, %158], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%160 = vector.insert_strided_slice %159, %156 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%161 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%0]
%162 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%163 = vector.transfer_read %alloc[%161, %162], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%164 = vector.insert_strided_slice %163, %160 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%165 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%0]
%166 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%167 = vector.transfer_read %alloc[%165, %166], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%168 = vector.insert_strided_slice %167, %164 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%169 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%0]
%170 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%171 = vector.transfer_read %alloc[%169, %170], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%172 = vector.insert_strided_slice %171, %168 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%173 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%0]
%174 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%175 = vector.transfer_read %alloc[%173, %174], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%176 = vector.insert_strided_slice %175, %172 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%177 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%0]
%178 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%179 = vector.transfer_read %alloc[%177, %178], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%180 = vector.insert_strided_slice %179, %176 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%181 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%0]
%182 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%183 = vector.transfer_read %alloc[%181, %182], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%184 = vector.insert_strided_slice %183, %180 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
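// NOTE (annotation): %120 assembled above is this thread's A fragment,
// vector<2x8x1x1x1x4xf16> = 2 M-batches x 8 K-batches of 1x4 f16 elements, matching the
// batch_tile = [2, 8] / element_tile = [1, 4] nested layout; %184 is the B fragment,
// vector<8x2x1x1x4x1xf16> (batch_tile = [8, 2], element_tile = [4, 1]). There are
// 128 / 16 = 8 K-batches because each MFMA_F32_16x16x16_F16 consumes 16 elements along K.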
%185 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%186 = vector.extract %120[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%187 = vector.extract %184[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%188 = vector.shape_cast %186 : vector<1x1x1x4xf16> to vector<4xf16>
%189 = vector.shape_cast %187 : vector<1x1x4x1xf16> to vector<4xf16>
%190 = vector.shape_cast %185 : vector<1x1x4x1xf32> to vector<4xf32>
%191 = amdgpu.mfma %188 * %189 + %190 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%192 = vector.extract %120[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%193 = vector.extract %184[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%194 = vector.shape_cast %192 : vector<1x1x1x4xf16> to vector<4xf16>
%195 = vector.shape_cast %193 : vector<1x1x4x1xf16> to vector<4xf16>
%196 = amdgpu.mfma %194 * %195 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%197 = vector.extract %120[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%198 = vector.extract %184[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%199 = vector.shape_cast %197 : vector<1x1x1x4xf16> to vector<4xf16>
%200 = vector.shape_cast %198 : vector<1x1x4x1xf16> to vector<4xf16>
%201 = amdgpu.mfma %199 * %200 + %196 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%202 = vector.extract %120[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%203 = vector.extract %184[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%204 = vector.shape_cast %202 : vector<1x1x1x4xf16> to vector<4xf16>
%205 = vector.shape_cast %203 : vector<1x1x4x1xf16> to vector<4xf16>
%206 = amdgpu.mfma %204 * %205 + %201 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%207 = vector.extract %120[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%208 = vector.extract %184[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%209 = vector.shape_cast %207 : vector<1x1x1x4xf16> to vector<4xf16>
%210 = vector.shape_cast %208 : vector<1x1x4x1xf16> to vector<4xf16>
%211 = amdgpu.mfma %209 * %210 + %206 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%212 = vector.extract %120[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%213 = vector.extract %184[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%214 = vector.shape_cast %212 : vector<1x1x1x4xf16> to vector<4xf16>
%215 = vector.shape_cast %213 : vector<1x1x4x1xf16> to vector<4xf16>
%216 = amdgpu.mfma %214 * %215 + %211 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%217 = vector.extract %120[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%218 = vector.extract %184[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%219 = vector.shape_cast %217 : vector<1x1x1x4xf16> to vector<4xf16>
%220 = vector.shape_cast %218 : vector<1x1x4x1xf16> to vector<4xf16>
%221 = amdgpu.mfma %219 * %220 + %216 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%222 = vector.extract %120[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%223 = vector.extract %184[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%224 = vector.shape_cast %222 : vector<1x1x1x4xf16> to vector<4xf16>
%225 = vector.shape_cast %223 : vector<1x1x4x1xf16> to vector<4xf16>
%226 = amdgpu.mfma %224 * %225 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%227 = vector.shape_cast %226 : vector<4xf32> to vector<1x1x4x1xf32>
%228 = vector.insert %227, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
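// NOTE (annotation): the block above is the accumulation for one of the four (M, N)
// batch tiles: 8 amdgpu.mfma ops are chained along K (each a full 16x16x16 MFMA taking
// 4xf16 A/B slices plus the 4xf32 running accumulator), then the 4xf32 result is reshaped
// to 1x1x4x1 and inserted at [0, 0] of the new accumulator. The same 8-deep chain repeats
// below for [0, 1], [1, 0] and [1, 1], i.e. 32 mfma ops per thread per K step.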
%229 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%230 = vector.extract %120[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%231 = vector.extract %184[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%232 = vector.shape_cast %230 : vector<1x1x1x4xf16> to vector<4xf16>
%233 = vector.shape_cast %231 : vector<1x1x4x1xf16> to vector<4xf16>
%234 = vector.shape_cast %229 : vector<1x1x4x1xf32> to vector<4xf32>
%235 = amdgpu.mfma %232 * %233 + %234 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%236 = vector.extract %120[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%237 = vector.extract %184[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%238 = vector.shape_cast %236 : vector<1x1x1x4xf16> to vector<4xf16>
%239 = vector.shape_cast %237 : vector<1x1x4x1xf16> to vector<4xf16>
%240 = amdgpu.mfma %238 * %239 + %235 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%241 = vector.extract %120[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%242 = vector.extract %184[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%243 = vector.shape_cast %241 : vector<1x1x1x4xf16> to vector<4xf16>
%244 = vector.shape_cast %242 : vector<1x1x4x1xf16> to vector<4xf16>
%245 = amdgpu.mfma %243 * %244 + %240 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%246 = vector.extract %120[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%247 = vector.extract %184[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%248 = vector.shape_cast %246 : vector<1x1x1x4xf16> to vector<4xf16>
%249 = vector.shape_cast %247 : vector<1x1x4x1xf16> to vector<4xf16>
%250 = amdgpu.mfma %248 * %249 + %245 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%251 = vector.extract %120[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%252 = vector.extract %184[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%253 = vector.shape_cast %251 : vector<1x1x1x4xf16> to vector<4xf16>
%254 = vector.shape_cast %252 : vector<1x1x4x1xf16> to vector<4xf16>
%255 = amdgpu.mfma %253 * %254 + %250 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%256 = vector.extract %120[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%257 = vector.extract %184[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%258 = vector.shape_cast %256 : vector<1x1x1x4xf16> to vector<4xf16>
%259 = vector.shape_cast %257 : vector<1x1x4x1xf16> to vector<4xf16>
%260 = amdgpu.mfma %258 * %259 + %255 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%261 = vector.extract %120[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%262 = vector.extract %184[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%263 = vector.shape_cast %261 : vector<1x1x1x4xf16> to vector<4xf16>
%264 = vector.shape_cast %262 : vector<1x1x4x1xf16> to vector<4xf16>
%265 = amdgpu.mfma %263 * %264 + %260 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%266 = vector.extract %120[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%267 = vector.extract %184[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%268 = vector.shape_cast %266 : vector<1x1x1x4xf16> to vector<4xf16>
%269 = vector.shape_cast %267 : vector<1x1x4x1xf16> to vector<4xf16>
%270 = amdgpu.mfma %268 * %269 + %265 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%271 = vector.shape_cast %270 : vector<4xf32> to vector<1x1x4x1xf32>
%272 = vector.insert %271, %228 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%273 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%274 = vector.extract %120[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%275 = vector.extract %184[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%276 = vector.shape_cast %274 : vector<1x1x1x4xf16> to vector<4xf16>
%277 = vector.shape_cast %275 : vector<1x1x4x1xf16> to vector<4xf16>
%278 = vector.shape_cast %273 : vector<1x1x4x1xf32> to vector<4xf32>
%279 = amdgpu.mfma %276 * %277 + %278 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%280 = vector.extract %120[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%281 = vector.extract %184[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%282 = vector.shape_cast %280 : vector<1x1x1x4xf16> to vector<4xf16>
%283 = vector.shape_cast %281 : vector<1x1x4x1xf16> to vector<4xf16>
%284 = amdgpu.mfma %282 * %283 + %279 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%285 = vector.extract %120[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%286 = vector.extract %184[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%287 = vector.shape_cast %285 : vector<1x1x1x4xf16> to vector<4xf16>
%288 = vector.shape_cast %286 : vector<1x1x4x1xf16> to vector<4xf16>
%289 = amdgpu.mfma %287 * %288 + %284 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%290 = vector.extract %120[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%291 = vector.extract %184[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%292 = vector.shape_cast %290 : vector<1x1x1x4xf16> to vector<4xf16>
%293 = vector.shape_cast %291 : vector<1x1x4x1xf16> to vector<4xf16>
%294 = amdgpu.mfma %292 * %293 + %289 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%295 = vector.extract %120[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%296 = vector.extract %184[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%297 = vector.shape_cast %295 : vector<1x1x1x4xf16> to vector<4xf16>
%298 = vector.shape_cast %296 : vector<1x1x4x1xf16> to vector<4xf16>
%299 = amdgpu.mfma %297 * %298 + %294 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%300 = vector.extract %120[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%301 = vector.extract %184[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%302 = vector.shape_cast %300 : vector<1x1x1x4xf16> to vector<4xf16>
%303 = vector.shape_cast %301 : vector<1x1x4x1xf16> to vector<4xf16>
%304 = amdgpu.mfma %302 * %303 + %299 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%305 = vector.extract %120[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%306 = vector.extract %184[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%307 = vector.shape_cast %305 : vector<1x1x1x4xf16> to vector<4xf16>
%308 = vector.shape_cast %306 : vector<1x1x4x1xf16> to vector<4xf16>
%309 = amdgpu.mfma %307 * %308 + %304 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%310 = vector.extract %120[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%311 = vector.extract %184[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%312 = vector.shape_cast %310 : vector<1x1x1x4xf16> to vector<4xf16>
%313 = vector.shape_cast %311 : vector<1x1x4x1xf16> to vector<4xf16>
%314 = amdgpu.mfma %312 * %313 + %309 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%315 = vector.shape_cast %314 : vector<4xf32> to vector<1x1x4x1xf32>
%316 = vector.insert %315, %272 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%317 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%318 = vector.extract %120[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%319 = vector.extract %184[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%320 = vector.shape_cast %318 : vector<1x1x1x4xf16> to vector<4xf16>
%321 = vector.shape_cast %319 : vector<1x1x4x1xf16> to vector<4xf16>
%322 = vector.shape_cast %317 : vector<1x1x4x1xf32> to vector<4xf32>
%323 = amdgpu.mfma %320 * %321 + %322 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%324 = vector.extract %120[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%325 = vector.extract %184[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%326 = vector.shape_cast %324 : vector<1x1x1x4xf16> to vector<4xf16>
%327 = vector.shape_cast %325 : vector<1x1x4x1xf16> to vector<4xf16>
%328 = amdgpu.mfma %326 * %327 + %323 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%329 = vector.extract %120[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%330 = vector.extract %184[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%331 = vector.shape_cast %329 : vector<1x1x1x4xf16> to vector<4xf16>
%332 = vector.shape_cast %330 : vector<1x1x4x1xf16> to vector<4xf16>
%333 = amdgpu.mfma %331 * %332 + %328 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%334 = vector.extract %120[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%335 = vector.extract %184[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%336 = vector.shape_cast %334 : vector<1x1x1x4xf16> to vector<4xf16>
%337 = vector.shape_cast %335 : vector<1x1x4x1xf16> to vector<4xf16>
%338 = amdgpu.mfma %336 * %337 + %333 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%339 = vector.extract %120[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%340 = vector.extract %184[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%341 = vector.shape_cast %339 : vector<1x1x1x4xf16> to vector<4xf16>
%342 = vector.shape_cast %340 : vector<1x1x4x1xf16> to vector<4xf16>
%343 = amdgpu.mfma %341 * %342 + %338 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%344 = vector.extract %120[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%345 = vector.extract %184[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%346 = vector.shape_cast %344 : vector<1x1x1x4xf16> to vector<4xf16>
%347 = vector.shape_cast %345 : vector<1x1x4x1xf16> to vector<4xf16>
%348 = amdgpu.mfma %346 * %347 + %343 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%349 = vector.extract %120[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%350 = vector.extract %184[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%351 = vector.shape_cast %349 : vector<1x1x1x4xf16> to vector<4xf16>
%352 = vector.shape_cast %350 : vector<1x1x4x1xf16> to vector<4xf16>
%353 = amdgpu.mfma %351 * %352 + %348 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%354 = vector.extract %120[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%355 = vector.extract %184[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%356 = vector.shape_cast %354 : vector<1x1x1x4xf16> to vector<4xf16>
%357 = vector.shape_cast %355 : vector<1x1x4x1xf16> to vector<4xf16>
%358 = amdgpu.mfma %356 * %357 + %353 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%359 = vector.shape_cast %358 : vector<4xf32> to vector<1x1x4x1xf32>
%360 = vector.insert %359, %316 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
scf.yield %360 : vector<2x2x1x1x4x1xf32>
}
%5 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%6 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%7 = vector.extract %4[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %7, %subview[%5, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0]
%9 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%10 = vector.extract %4[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %10, %subview[%8, %9] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%12 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0]
%13 = vector.extract %4[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %13, %subview[%11, %12] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%14 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0]
%15 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0]
%16 = vector.extract %4[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %16, %subview[%14, %15] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.dealloc %alloc_3 : memref<64x128xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<128x64xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
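// NOTE (editor annotation, not part of the compiler dump): the canonicalized function below keeps the
// structure established earlier in the pipeline and is summarized here for easier reading.
//   * Two workgroup (LDS) staging buffers are allocated: %alloc_3 : memref<64x128xf16> for the A tile
//     and %alloc : memref<128x64xf16> for the B tile.
//   * An scf.forall over 64x64 output tiles is mapped to workgroups (y, x); inside it an
//     scf.for walks the K dimension from 0 to 256 in steps of 128, carrying a
//     vector<2x2x1x1x4x1xf32> accumulator (%arg3).
//   * Each K step: gpu.barrier, every thread copies four vector<1x8xf16> rows of A and four of B
//     from global memory into the shared tiles, another gpu.barrier, then the thread loads its
//     2x8 A fragments (vector<1x4xf16>) and 8x2 B fragments (vector<4x1xf16>) and issues
//     2 x 2 x 8 = 32 amdgpu.mfma ops with m = n = k = 16 on f16 inputs, chain-accumulating into
//     four vector<4xf32> accumulators that are reassembled into the 2x2x1x1x4x1 result.
//   * After the loop, the four vector<4x1xf32> accumulator slices are written to the 64x64
//     subview of C at thread-dependent offsets computed by affine.apply on %thread_id_x.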
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16>
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32>
%cst_2 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%alloc = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) {
gpu.barrier
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x]
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x]
%18 = vector.transfer_read %0[%16, %17], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x]
%20 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x]
%21 = vector.transfer_read %0[%19, %20], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x]
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x]
%24 = vector.transfer_read %0[%22, %23], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%25 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x]
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x]
%27 = vector.transfer_read %0[%25, %26], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x]
%29 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x]
%30 = vector.transfer_read %1[%28, %29], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%31 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x]
%32 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x]
%33 = vector.transfer_read %1[%31, %32], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%34 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x]
%35 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x]
%36 = vector.transfer_read %1[%34, %35], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%37 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x]
%38 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x]
%39 = vector.transfer_read %1[%37, %38], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%40 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x]
%41 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x]
vector.transfer_write %18, %alloc_3[%40, %41] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%42 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x]
%43 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x]
vector.transfer_write %21, %alloc_3[%42, %43] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x]
%45 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x]
vector.transfer_write %24, %alloc_3[%44, %45] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%46 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x]
%47 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x]
vector.transfer_write %27, %alloc_3[%46, %47] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%48 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x]
%49 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
vector.transfer_write %30, %alloc[%48, %49] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
%50 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x]
%51 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
vector.transfer_write %33, %alloc[%50, %51] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
%52 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x]
%53 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
vector.transfer_write %36, %alloc[%52, %53] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
%54 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x]
%55 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
vector.transfer_write %39, %alloc[%54, %55] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%56 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%57 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%58 = vector.transfer_read %alloc_3[%56, %57], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%59 = vector.insert_strided_slice %58, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%60 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%61 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%62 = vector.transfer_read %alloc_3[%60, %61], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%63 = vector.insert_strided_slice %62, %59 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%64 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%65 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x]
%66 = vector.transfer_read %alloc_3[%64, %65], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%67 = vector.insert_strided_slice %66, %63 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%68 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%69 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x]
%70 = vector.transfer_read %alloc_3[%68, %69], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%71 = vector.insert_strided_slice %70, %67 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%72 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%73 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x]
%74 = vector.transfer_read %alloc_3[%72, %73], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%75 = vector.insert_strided_slice %74, %71 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%76 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%77 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x]
%78 = vector.transfer_read %alloc_3[%76, %77], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%79 = vector.insert_strided_slice %78, %75 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%80 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%81 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x]
%82 = vector.transfer_read %alloc_3[%80, %81], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%83 = vector.insert_strided_slice %82, %79 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%84 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%85 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x]
%86 = vector.transfer_read %alloc_3[%84, %85], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%87 = vector.insert_strided_slice %86, %83 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%88 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%89 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%90 = vector.transfer_read %alloc_3[%88, %89], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%91 = vector.insert_strided_slice %90, %87 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%92 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%93 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%94 = vector.transfer_read %alloc_3[%92, %93], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%95 = vector.insert_strided_slice %94, %91 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%96 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%97 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x]
%98 = vector.transfer_read %alloc_3[%96, %97], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%99 = vector.insert_strided_slice %98, %95 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%100 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%101 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x]
%102 = vector.transfer_read %alloc_3[%100, %101], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%103 = vector.insert_strided_slice %102, %99 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%104 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%105 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x]
%106 = vector.transfer_read %alloc_3[%104, %105], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%107 = vector.insert_strided_slice %106, %103 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%108 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%109 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x]
%110 = vector.transfer_read %alloc_3[%108, %109], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%111 = vector.insert_strided_slice %110, %107 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%112 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%113 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x]
%114 = vector.transfer_read %alloc_3[%112, %113], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%115 = vector.insert_strided_slice %114, %111 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%116 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%117 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x]
%118 = vector.transfer_read %alloc_3[%116, %117], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%119 = vector.insert_strided_slice %118, %115 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%120 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%121 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%122 = vector.transfer_read %alloc[%120, %121], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%123 = vector.insert_strided_slice %122, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%124 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%125 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%126 = vector.transfer_read %alloc[%124, %125], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%127 = vector.insert_strided_slice %126, %123 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%128 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%129 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%130 = vector.transfer_read %alloc[%128, %129], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%131 = vector.insert_strided_slice %130, %127 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%132 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%133 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%134 = vector.transfer_read %alloc[%132, %133], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%135 = vector.insert_strided_slice %134, %131 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%136 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x]
%137 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%138 = vector.transfer_read %alloc[%136, %137], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%139 = vector.insert_strided_slice %138, %135 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%140 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x]
%141 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%142 = vector.transfer_read %alloc[%140, %141], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%143 = vector.insert_strided_slice %142, %139 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%144 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x]
%145 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%146 = vector.transfer_read %alloc[%144, %145], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%147 = vector.insert_strided_slice %146, %143 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%148 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x]
%149 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%150 = vector.transfer_read %alloc[%148, %149], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%151 = vector.insert_strided_slice %150, %147 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%152 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x]
%153 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%154 = vector.transfer_read %alloc[%152, %153], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%155 = vector.insert_strided_slice %154, %151 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%156 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x]
%157 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%158 = vector.transfer_read %alloc[%156, %157], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%159 = vector.insert_strided_slice %158, %155 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%160 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x]
%161 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%162 = vector.transfer_read %alloc[%160, %161], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%163 = vector.insert_strided_slice %162, %159 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%164 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x]
%165 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%166 = vector.transfer_read %alloc[%164, %165], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%167 = vector.insert_strided_slice %166, %163 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%168 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x]
%169 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%170 = vector.transfer_read %alloc[%168, %169], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%171 = vector.insert_strided_slice %170, %167 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%172 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x]
%173 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%174 = vector.transfer_read %alloc[%172, %173], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%175 = vector.insert_strided_slice %174, %171 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%176 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x]
%177 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%178 = vector.transfer_read %alloc[%176, %177], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%179 = vector.insert_strided_slice %178, %175 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%180 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x]
%181 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%182 = vector.transfer_read %alloc[%180, %181], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%183 = vector.insert_strided_slice %182, %179 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%184 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%185 = vector.extract %119[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%186 = vector.extract %183[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%187 = vector.shape_cast %185 : vector<1x1x1x4xf16> to vector<4xf16>
%188 = vector.shape_cast %186 : vector<1x1x4x1xf16> to vector<4xf16>
%189 = vector.shape_cast %184 : vector<1x1x4x1xf32> to vector<4xf32>
%190 = amdgpu.mfma %187 * %188 + %189 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%191 = vector.extract %119[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%192 = vector.extract %183[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%193 = vector.shape_cast %191 : vector<1x1x1x4xf16> to vector<4xf16>
%194 = vector.shape_cast %192 : vector<1x1x4x1xf16> to vector<4xf16>
%195 = amdgpu.mfma %193 * %194 + %190 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%196 = vector.extract %119[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%197 = vector.extract %183[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%198 = vector.shape_cast %196 : vector<1x1x1x4xf16> to vector<4xf16>
%199 = vector.shape_cast %197 : vector<1x1x4x1xf16> to vector<4xf16>
%200 = amdgpu.mfma %198 * %199 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%201 = vector.extract %119[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%202 = vector.extract %183[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%203 = vector.shape_cast %201 : vector<1x1x1x4xf16> to vector<4xf16>
%204 = vector.shape_cast %202 : vector<1x1x4x1xf16> to vector<4xf16>
%205 = amdgpu.mfma %203 * %204 + %200 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%206 = vector.extract %119[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%207 = vector.extract %183[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%208 = vector.shape_cast %206 : vector<1x1x1x4xf16> to vector<4xf16>
%209 = vector.shape_cast %207 : vector<1x1x4x1xf16> to vector<4xf16>
%210 = amdgpu.mfma %208 * %209 + %205 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%211 = vector.extract %119[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%212 = vector.extract %183[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%213 = vector.shape_cast %211 : vector<1x1x1x4xf16> to vector<4xf16>
%214 = vector.shape_cast %212 : vector<1x1x4x1xf16> to vector<4xf16>
%215 = amdgpu.mfma %213 * %214 + %210 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%216 = vector.extract %119[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%217 = vector.extract %183[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%218 = vector.shape_cast %216 : vector<1x1x1x4xf16> to vector<4xf16>
%219 = vector.shape_cast %217 : vector<1x1x4x1xf16> to vector<4xf16>
%220 = amdgpu.mfma %218 * %219 + %215 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%221 = vector.extract %119[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%222 = vector.extract %183[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%223 = vector.shape_cast %221 : vector<1x1x1x4xf16> to vector<4xf16>
%224 = vector.shape_cast %222 : vector<1x1x4x1xf16> to vector<4xf16>
%225 = amdgpu.mfma %223 * %224 + %220 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x4x1xf32>
%227 = vector.insert %226, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%228 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%229 = vector.extract %119[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%230 = vector.extract %183[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%231 = vector.shape_cast %229 : vector<1x1x1x4xf16> to vector<4xf16>
%232 = vector.shape_cast %230 : vector<1x1x4x1xf16> to vector<4xf16>
%233 = vector.shape_cast %228 : vector<1x1x4x1xf32> to vector<4xf32>
%234 = amdgpu.mfma %231 * %232 + %233 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%235 = vector.extract %119[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%236 = vector.extract %183[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%237 = vector.shape_cast %235 : vector<1x1x1x4xf16> to vector<4xf16>
%238 = vector.shape_cast %236 : vector<1x1x4x1xf16> to vector<4xf16>
%239 = amdgpu.mfma %237 * %238 + %234 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%240 = vector.extract %119[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%241 = vector.extract %183[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%242 = vector.shape_cast %240 : vector<1x1x1x4xf16> to vector<4xf16>
%243 = vector.shape_cast %241 : vector<1x1x4x1xf16> to vector<4xf16>
%244 = amdgpu.mfma %242 * %243 + %239 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%245 = vector.extract %119[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%246 = vector.extract %183[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%247 = vector.shape_cast %245 : vector<1x1x1x4xf16> to vector<4xf16>
%248 = vector.shape_cast %246 : vector<1x1x4x1xf16> to vector<4xf16>
%249 = amdgpu.mfma %247 * %248 + %244 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%250 = vector.extract %119[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%251 = vector.extract %183[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%252 = vector.shape_cast %250 : vector<1x1x1x4xf16> to vector<4xf16>
%253 = vector.shape_cast %251 : vector<1x1x4x1xf16> to vector<4xf16>
%254 = amdgpu.mfma %252 * %253 + %249 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%255 = vector.extract %119[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%256 = vector.extract %183[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%257 = vector.shape_cast %255 : vector<1x1x1x4xf16> to vector<4xf16>
%258 = vector.shape_cast %256 : vector<1x1x4x1xf16> to vector<4xf16>
%259 = amdgpu.mfma %257 * %258 + %254 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%260 = vector.extract %119[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%261 = vector.extract %183[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%262 = vector.shape_cast %260 : vector<1x1x1x4xf16> to vector<4xf16>
%263 = vector.shape_cast %261 : vector<1x1x4x1xf16> to vector<4xf16>
%264 = amdgpu.mfma %262 * %263 + %259 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%265 = vector.extract %119[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%266 = vector.extract %183[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%267 = vector.shape_cast %265 : vector<1x1x1x4xf16> to vector<4xf16>
%268 = vector.shape_cast %266 : vector<1x1x4x1xf16> to vector<4xf16>
%269 = amdgpu.mfma %267 * %268 + %264 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%270 = vector.shape_cast %269 : vector<4xf32> to vector<1x1x4x1xf32>
%271 = vector.insert %270, %227 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%272 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%273 = vector.extract %119[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%274 = vector.extract %183[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%275 = vector.shape_cast %273 : vector<1x1x1x4xf16> to vector<4xf16>
%276 = vector.shape_cast %274 : vector<1x1x4x1xf16> to vector<4xf16>
%277 = vector.shape_cast %272 : vector<1x1x4x1xf32> to vector<4xf32>
%278 = amdgpu.mfma %275 * %276 + %277 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%279 = vector.extract %119[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%280 = vector.extract %183[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%281 = vector.shape_cast %279 : vector<1x1x1x4xf16> to vector<4xf16>
%282 = vector.shape_cast %280 : vector<1x1x4x1xf16> to vector<4xf16>
%283 = amdgpu.mfma %281 * %282 + %278 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%284 = vector.extract %119[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%285 = vector.extract %183[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%286 = vector.shape_cast %284 : vector<1x1x1x4xf16> to vector<4xf16>
%287 = vector.shape_cast %285 : vector<1x1x4x1xf16> to vector<4xf16>
%288 = amdgpu.mfma %286 * %287 + %283 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%289 = vector.extract %119[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%290 = vector.extract %183[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%291 = vector.shape_cast %289 : vector<1x1x1x4xf16> to vector<4xf16>
%292 = vector.shape_cast %290 : vector<1x1x4x1xf16> to vector<4xf16>
%293 = amdgpu.mfma %291 * %292 + %288 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%294 = vector.extract %119[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%295 = vector.extract %183[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%296 = vector.shape_cast %294 : vector<1x1x1x4xf16> to vector<4xf16>
%297 = vector.shape_cast %295 : vector<1x1x4x1xf16> to vector<4xf16>
%298 = amdgpu.mfma %296 * %297 + %293 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%299 = vector.extract %119[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%300 = vector.extract %183[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%301 = vector.shape_cast %299 : vector<1x1x1x4xf16> to vector<4xf16>
%302 = vector.shape_cast %300 : vector<1x1x4x1xf16> to vector<4xf16>
%303 = amdgpu.mfma %301 * %302 + %298 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%304 = vector.extract %119[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%305 = vector.extract %183[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%306 = vector.shape_cast %304 : vector<1x1x1x4xf16> to vector<4xf16>
%307 = vector.shape_cast %305 : vector<1x1x4x1xf16> to vector<4xf16>
%308 = amdgpu.mfma %306 * %307 + %303 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%309 = vector.extract %119[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%310 = vector.extract %183[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%311 = vector.shape_cast %309 : vector<1x1x1x4xf16> to vector<4xf16>
%312 = vector.shape_cast %310 : vector<1x1x4x1xf16> to vector<4xf16>
%313 = amdgpu.mfma %311 * %312 + %308 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%314 = vector.shape_cast %313 : vector<4xf32> to vector<1x1x4x1xf32>
%315 = vector.insert %314, %271 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%316 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%317 = vector.extract %119[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%318 = vector.extract %183[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%319 = vector.shape_cast %317 : vector<1x1x1x4xf16> to vector<4xf16>
%320 = vector.shape_cast %318 : vector<1x1x4x1xf16> to vector<4xf16>
%321 = vector.shape_cast %316 : vector<1x1x4x1xf32> to vector<4xf32>
%322 = amdgpu.mfma %319 * %320 + %321 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%323 = vector.extract %119[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%324 = vector.extract %183[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%325 = vector.shape_cast %323 : vector<1x1x1x4xf16> to vector<4xf16>
%326 = vector.shape_cast %324 : vector<1x1x4x1xf16> to vector<4xf16>
%327 = amdgpu.mfma %325 * %326 + %322 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%328 = vector.extract %119[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%329 = vector.extract %183[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%330 = vector.shape_cast %328 : vector<1x1x1x4xf16> to vector<4xf16>
%331 = vector.shape_cast %329 : vector<1x1x4x1xf16> to vector<4xf16>
%332 = amdgpu.mfma %330 * %331 + %327 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%333 = vector.extract %119[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%334 = vector.extract %183[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%335 = vector.shape_cast %333 : vector<1x1x1x4xf16> to vector<4xf16>
%336 = vector.shape_cast %334 : vector<1x1x4x1xf16> to vector<4xf16>
%337 = amdgpu.mfma %335 * %336 + %332 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%338 = vector.extract %119[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%339 = vector.extract %183[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%340 = vector.shape_cast %338 : vector<1x1x1x4xf16> to vector<4xf16>
%341 = vector.shape_cast %339 : vector<1x1x4x1xf16> to vector<4xf16>
%342 = amdgpu.mfma %340 * %341 + %337 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%343 = vector.extract %119[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%344 = vector.extract %183[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%345 = vector.shape_cast %343 : vector<1x1x1x4xf16> to vector<4xf16>
%346 = vector.shape_cast %344 : vector<1x1x4x1xf16> to vector<4xf16>
%347 = amdgpu.mfma %345 * %346 + %342 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%348 = vector.extract %119[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%349 = vector.extract %183[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%350 = vector.shape_cast %348 : vector<1x1x1x4xf16> to vector<4xf16>
%351 = vector.shape_cast %349 : vector<1x1x4x1xf16> to vector<4xf16>
%352 = amdgpu.mfma %350 * %351 + %347 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%353 = vector.extract %119[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%354 = vector.extract %183[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%355 = vector.shape_cast %353 : vector<1x1x1x4xf16> to vector<4xf16>
%356 = vector.shape_cast %354 : vector<1x1x4x1xf16> to vector<4xf16>
%357 = amdgpu.mfma %355 * %356 + %352 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%358 = vector.shape_cast %357 : vector<4xf32> to vector<1x1x4x1xf32>
%359 = vector.insert %358, %315 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
scf.yield %359 : vector<2x2x1x1x4x1xf32>
}
%4 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%6 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %6, %subview[%4, %5] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%9 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %9, %subview[%7, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%10 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%11 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%12 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %12, %subview[%10, %11] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%13 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%14 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%15 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %15, %subview[%13, %14] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.dealloc %alloc_3 : memref<64x128xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<128x64xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After CSE (cse) //----- //
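// NOTE (editor annotation, not part of the compiler dump): CSE removes redundant pure ops with
// identical operands. Its visible effect here is that the per-thread index arithmetic is no longer
// recomputed for every transfer: in the canonicalized IR above, each of the four A-row loads built
// its own copy of the same affine.apply, whereas below a single result (e.g. %13 for the A column
// index, %22 for the B column index, %31 and %36 for the shared-memory store columns) is reused.
// Illustrative before/after sketch of that pattern (SSA names taken from the surrounding dumps,
// #map stands for affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>):
//
//   // before CSE: one affine.apply per read, all identical
//   %17 = affine.apply #map()[%arg2, %thread_id_x]
//   %18 = vector.transfer_read %0[%16, %17], %cst_2 ... : ..., vector<1x8xf16>
//   %20 = affine.apply #map()[%arg2, %thread_id_x]
//   %21 = vector.transfer_read %0[%19, %20], %cst_2 ... : ..., vector<1x8xf16>
//
//   // after CSE: the column index is computed once and shared
//   %13 = affine.apply #map()[%arg2, %thread_id_x]
//   %14 = vector.transfer_read %0[%12, %13], %cst_2 ... : ..., vector<1x8xf16>
//   %16 = vector.transfer_read %0[%15, %13], %cst_2 ... : ..., vector<1x8xf16>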
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16>
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32>
%cst_2 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%alloc = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) {
gpu.barrier
%12 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x]
%13 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x]
%14 = vector.transfer_read %0[%12, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x]
%16 = vector.transfer_read %0[%15, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x]
%18 = vector.transfer_read %0[%17, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x]
%20 = vector.transfer_read %0[%19, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x]
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x]
%23 = vector.transfer_read %1[%21, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x]
%25 = vector.transfer_read %1[%24, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x]
%27 = vector.transfer_read %1[%26, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x]
%29 = vector.transfer_read %1[%28, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x]
%31 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x]
vector.transfer_write %14, %alloc_3[%30, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%32 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x]
vector.transfer_write %16, %alloc_3[%32, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%33 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x]
vector.transfer_write %18, %alloc_3[%33, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x]
vector.transfer_write %20, %alloc_3[%34, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>>
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x]
%36 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
vector.transfer_write %23, %alloc[%35, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
%37 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x]
vector.transfer_write %25, %alloc[%37, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x]
vector.transfer_write %27, %alloc[%38, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
%39 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x]
vector.transfer_write %29, %alloc[%39, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>>
gpu.barrier
%40 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%42 = vector.transfer_read %alloc_3[%40, %41], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%43 = vector.insert_strided_slice %42, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%45 = vector.transfer_read %alloc_3[%40, %44], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%46 = vector.insert_strided_slice %45, %43 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%47 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x]
%48 = vector.transfer_read %alloc_3[%40, %47], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%49 = vector.insert_strided_slice %48, %46 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%50 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x]
%51 = vector.transfer_read %alloc_3[%40, %50], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%52 = vector.insert_strided_slice %51, %49 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%53 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x]
%54 = vector.transfer_read %alloc_3[%40, %53], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%55 = vector.insert_strided_slice %54, %52 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%56 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x]
%57 = vector.transfer_read %alloc_3[%40, %56], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%58 = vector.insert_strided_slice %57, %55 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%59 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x]
%60 = vector.transfer_read %alloc_3[%40, %59], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%61 = vector.insert_strided_slice %60, %58 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%62 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x]
%63 = vector.transfer_read %alloc_3[%40, %62], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%64 = vector.insert_strided_slice %63, %61 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%65 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%66 = vector.transfer_read %alloc_3[%65, %41], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%67 = vector.insert_strided_slice %66, %64 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%68 = vector.transfer_read %alloc_3[%65, %44], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%69 = vector.insert_strided_slice %68, %67 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%70 = vector.transfer_read %alloc_3[%65, %47], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%71 = vector.insert_strided_slice %70, %69 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%72 = vector.transfer_read %alloc_3[%65, %50], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%73 = vector.insert_strided_slice %72, %71 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%74 = vector.transfer_read %alloc_3[%65, %53], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%75 = vector.insert_strided_slice %74, %73 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%76 = vector.transfer_read %alloc_3[%65, %56], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%77 = vector.insert_strided_slice %76, %75 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%78 = vector.transfer_read %alloc_3[%65, %59], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%79 = vector.insert_strided_slice %78, %77 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%80 = vector.transfer_read %alloc_3[%65, %62], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%81 = vector.insert_strided_slice %80, %79 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%82 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%83 = vector.transfer_read %alloc[%41, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%84 = vector.insert_strided_slice %83, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%85 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%86 = vector.transfer_read %alloc[%41, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%87 = vector.insert_strided_slice %86, %84 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%88 = vector.transfer_read %alloc[%44, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%89 = vector.insert_strided_slice %88, %87 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%90 = vector.transfer_read %alloc[%44, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%91 = vector.insert_strided_slice %90, %89 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%92 = vector.transfer_read %alloc[%47, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%93 = vector.insert_strided_slice %92, %91 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%94 = vector.transfer_read %alloc[%47, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%95 = vector.insert_strided_slice %94, %93 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%96 = vector.transfer_read %alloc[%50, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%97 = vector.insert_strided_slice %96, %95 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%98 = vector.transfer_read %alloc[%50, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%99 = vector.insert_strided_slice %98, %97 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%100 = vector.transfer_read %alloc[%53, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%101 = vector.insert_strided_slice %100, %99 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%102 = vector.transfer_read %alloc[%53, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%103 = vector.insert_strided_slice %102, %101 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%104 = vector.transfer_read %alloc[%56, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%105 = vector.insert_strided_slice %104, %103 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%106 = vector.transfer_read %alloc[%56, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%107 = vector.insert_strided_slice %106, %105 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%108 = vector.transfer_read %alloc[%59, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%109 = vector.insert_strided_slice %108, %107 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%110 = vector.transfer_read %alloc[%59, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%111 = vector.insert_strided_slice %110, %109 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%112 = vector.transfer_read %alloc[%62, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%113 = vector.insert_strided_slice %112, %111 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%114 = vector.transfer_read %alloc[%62, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%115 = vector.insert_strided_slice %114, %113 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%116 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%117 = vector.extract %81[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%118 = vector.extract %115[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%119 = vector.shape_cast %117 : vector<1x1x1x4xf16> to vector<4xf16>
%120 = vector.shape_cast %118 : vector<1x1x4x1xf16> to vector<4xf16>
%121 = vector.shape_cast %116 : vector<1x1x4x1xf32> to vector<4xf32>
%122 = amdgpu.mfma %119 * %120 + %121 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%123 = vector.extract %81[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%124 = vector.extract %115[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%125 = vector.shape_cast %123 : vector<1x1x1x4xf16> to vector<4xf16>
%126 = vector.shape_cast %124 : vector<1x1x4x1xf16> to vector<4xf16>
%127 = amdgpu.mfma %125 * %126 + %122 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%128 = vector.extract %81[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%129 = vector.extract %115[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%130 = vector.shape_cast %128 : vector<1x1x1x4xf16> to vector<4xf16>
%131 = vector.shape_cast %129 : vector<1x1x4x1xf16> to vector<4xf16>
%132 = amdgpu.mfma %130 * %131 + %127 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%133 = vector.extract %81[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%134 = vector.extract %115[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%135 = vector.shape_cast %133 : vector<1x1x1x4xf16> to vector<4xf16>
%136 = vector.shape_cast %134 : vector<1x1x4x1xf16> to vector<4xf16>
%137 = amdgpu.mfma %135 * %136 + %132 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%138 = vector.extract %81[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%139 = vector.extract %115[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%140 = vector.shape_cast %138 : vector<1x1x1x4xf16> to vector<4xf16>
%141 = vector.shape_cast %139 : vector<1x1x4x1xf16> to vector<4xf16>
%142 = amdgpu.mfma %140 * %141 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%143 = vector.extract %81[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%144 = vector.extract %115[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%145 = vector.shape_cast %143 : vector<1x1x1x4xf16> to vector<4xf16>
%146 = vector.shape_cast %144 : vector<1x1x4x1xf16> to vector<4xf16>
%147 = amdgpu.mfma %145 * %146 + %142 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%148 = vector.extract %81[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%149 = vector.extract %115[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%150 = vector.shape_cast %148 : vector<1x1x1x4xf16> to vector<4xf16>
%151 = vector.shape_cast %149 : vector<1x1x4x1xf16> to vector<4xf16>
%152 = amdgpu.mfma %150 * %151 + %147 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%153 = vector.extract %81[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%154 = vector.extract %115[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%155 = vector.shape_cast %153 : vector<1x1x1x4xf16> to vector<4xf16>
%156 = vector.shape_cast %154 : vector<1x1x4x1xf16> to vector<4xf16>
%157 = amdgpu.mfma %155 * %156 + %152 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%158 = vector.shape_cast %157 : vector<4xf32> to vector<1x1x4x1xf32>
%159 = vector.insert %158, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%160 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%161 = vector.extract %115[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%162 = vector.shape_cast %161 : vector<1x1x4x1xf16> to vector<4xf16>
%163 = vector.shape_cast %160 : vector<1x1x4x1xf32> to vector<4xf32>
%164 = amdgpu.mfma %119 * %162 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%165 = vector.extract %115[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%166 = vector.shape_cast %165 : vector<1x1x4x1xf16> to vector<4xf16>
%167 = amdgpu.mfma %125 * %166 + %164 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%168 = vector.extract %115[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%169 = vector.shape_cast %168 : vector<1x1x4x1xf16> to vector<4xf16>
%170 = amdgpu.mfma %130 * %169 + %167 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%171 = vector.extract %115[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%172 = vector.shape_cast %171 : vector<1x1x4x1xf16> to vector<4xf16>
%173 = amdgpu.mfma %135 * %172 + %170 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%174 = vector.extract %115[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%175 = vector.shape_cast %174 : vector<1x1x4x1xf16> to vector<4xf16>
%176 = amdgpu.mfma %140 * %175 + %173 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%177 = vector.extract %115[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%178 = vector.shape_cast %177 : vector<1x1x4x1xf16> to vector<4xf16>
%179 = amdgpu.mfma %145 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%180 = vector.extract %115[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%181 = vector.shape_cast %180 : vector<1x1x4x1xf16> to vector<4xf16>
%182 = amdgpu.mfma %150 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%183 = vector.extract %115[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%184 = vector.shape_cast %183 : vector<1x1x4x1xf16> to vector<4xf16>
%185 = amdgpu.mfma %155 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x4x1xf32>
%187 = vector.insert %186, %159 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%188 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%189 = vector.extract %81[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%190 = vector.shape_cast %189 : vector<1x1x1x4xf16> to vector<4xf16>
%191 = vector.shape_cast %188 : vector<1x1x4x1xf32> to vector<4xf32>
%192 = amdgpu.mfma %190 * %120 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%193 = vector.extract %81[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%194 = vector.shape_cast %193 : vector<1x1x1x4xf16> to vector<4xf16>
%195 = amdgpu.mfma %194 * %126 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%196 = vector.extract %81[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%197 = vector.shape_cast %196 : vector<1x1x1x4xf16> to vector<4xf16>
%198 = amdgpu.mfma %197 * %131 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%199 = vector.extract %81[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%200 = vector.shape_cast %199 : vector<1x1x1x4xf16> to vector<4xf16>
%201 = amdgpu.mfma %200 * %136 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%202 = vector.extract %81[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%203 = vector.shape_cast %202 : vector<1x1x1x4xf16> to vector<4xf16>
%204 = amdgpu.mfma %203 * %141 + %201 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%205 = vector.extract %81[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%206 = vector.shape_cast %205 : vector<1x1x1x4xf16> to vector<4xf16>
%207 = amdgpu.mfma %206 * %146 + %204 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%208 = vector.extract %81[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%209 = vector.shape_cast %208 : vector<1x1x1x4xf16> to vector<4xf16>
%210 = amdgpu.mfma %209 * %151 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%211 = vector.extract %81[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%212 = vector.shape_cast %211 : vector<1x1x1x4xf16> to vector<4xf16>
%213 = amdgpu.mfma %212 * %156 + %210 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%214 = vector.shape_cast %213 : vector<4xf32> to vector<1x1x4x1xf32>
%215 = vector.insert %214, %187 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%216 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%217 = vector.shape_cast %216 : vector<1x1x4x1xf32> to vector<4xf32>
%218 = amdgpu.mfma %190 * %162 + %217 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%219 = amdgpu.mfma %194 * %166 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%220 = amdgpu.mfma %197 * %169 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%221 = amdgpu.mfma %200 * %172 + %220 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%222 = amdgpu.mfma %203 * %175 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%223 = amdgpu.mfma %206 * %178 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%224 = amdgpu.mfma %209 * %181 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%225 = amdgpu.mfma %212 * %184 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x4x1xf32>
%227 = vector.insert %226, %215 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
scf.yield %227 : vector<2x2x1x1x4x1xf32>
}
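    // Epilogue: each thread writes its four 4x1xf32 accumulator fragments into the 64x64 output
    // subview at thread-derived offsets (%4 and %9 select the rows, %5 and %7 the columns).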
%4 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%6 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %6, %subview[%4, %5] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%8 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %8, %subview[%4, %7] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%10 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %10, %subview[%9, %5] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%11 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %11, %subview[%9, %7] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.dealloc %alloc_3 : memref<64x128xf16, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<128x64xf16, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After GPUReduceBankConflictsPass (iree-codegen-gpu-reduce-bank-conflicts) //----- //
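// The pass pads both workgroup allocations by 4 f16 columns (128x64 -> 128x68 and
// 64x128 -> 64x132) and routes every access through a subview of the original shape, so rows
// land at strides of 68 and 132 elements; the extra columns are never read or written and exist
// only to stagger rows across shared-memory banks.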
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16>
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32>
%cst_2 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%alloc = memref.alloc() : memref<128x68xf16, #gpu.address_space<workgroup>>
%subview = memref.subview %alloc[0, 0] [128, 64] [1, 1] : memref<128x68xf16, #gpu.address_space<workgroup>> to memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<64x132xf16, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc_3[0, 0] [64, 128] [1, 1] : memref<64x132xf16, #gpu.address_space<workgroup>> to memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
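  // Worked example of why the padding helps, assuming 32 shared-memory banks of 4 bytes (the
  // usual CDNA configuration, not stated in this dump): with the unpadded 64-element row stride,
  // element (r, c) of the 128x64 tile sits at byte (r*64 + c)*2, rows are 128 bytes = exactly
  // 32 banks apart, and a column read over 16 consecutive rows hits a single bank 16 times.
  // With the padded 68-element stride, rows are 136 bytes = 34 banks apart, the bank index
  // advances by 34 mod 32 = 2 per row, and the same 16-row column read touches 16 distinct banks.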
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%subview_5 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) {
gpu.barrier
%12 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x]
%13 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x]
%14 = vector.transfer_read %0[%12, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x]
%16 = vector.transfer_read %0[%15, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x]
%18 = vector.transfer_read %0[%17, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x]
%20 = vector.transfer_read %0[%19, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x]
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x]
%23 = vector.transfer_read %1[%21, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x]
%25 = vector.transfer_read %1[%24, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x]
%27 = vector.transfer_read %1[%26, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x]
%29 = vector.transfer_read %1[%28, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x]
%31 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x]
vector.transfer_write %14, %subview_4[%30, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
%32 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x]
vector.transfer_write %16, %subview_4[%32, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
%33 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x]
vector.transfer_write %18, %subview_4[%33, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x]
vector.transfer_write %20, %subview_4[%34, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x]
%36 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
vector.transfer_write %23, %subview[%35, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
%37 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x]
vector.transfer_write %25, %subview[%37, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x]
vector.transfer_write %27, %subview[%38, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
%39 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x]
vector.transfer_write %29, %subview[%39, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
gpu.barrier
%40 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%42 = vector.transfer_read %subview_4[%40, %41], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%43 = vector.insert_strided_slice %42, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%45 = vector.transfer_read %subview_4[%40, %44], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%46 = vector.insert_strided_slice %45, %43 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%47 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x]
%48 = vector.transfer_read %subview_4[%40, %47], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%49 = vector.insert_strided_slice %48, %46 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%50 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x]
%51 = vector.transfer_read %subview_4[%40, %50], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%52 = vector.insert_strided_slice %51, %49 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%53 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x]
%54 = vector.transfer_read %subview_4[%40, %53], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%55 = vector.insert_strided_slice %54, %52 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%56 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x]
%57 = vector.transfer_read %subview_4[%40, %56], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%58 = vector.insert_strided_slice %57, %55 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%59 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x]
%60 = vector.transfer_read %subview_4[%40, %59], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%61 = vector.insert_strided_slice %60, %58 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%62 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x]
%63 = vector.transfer_read %subview_4[%40, %62], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%64 = vector.insert_strided_slice %63, %61 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%65 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%66 = vector.transfer_read %subview_4[%65, %41], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%67 = vector.insert_strided_slice %66, %64 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%68 = vector.transfer_read %subview_4[%65, %44], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%69 = vector.insert_strided_slice %68, %67 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%70 = vector.transfer_read %subview_4[%65, %47], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%71 = vector.insert_strided_slice %70, %69 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%72 = vector.transfer_read %subview_4[%65, %50], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%73 = vector.insert_strided_slice %72, %71 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%74 = vector.transfer_read %subview_4[%65, %53], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%75 = vector.insert_strided_slice %74, %73 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%76 = vector.transfer_read %subview_4[%65, %56], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%77 = vector.insert_strided_slice %76, %75 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%78 = vector.transfer_read %subview_4[%65, %59], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%79 = vector.insert_strided_slice %78, %77 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%80 = vector.transfer_read %subview_4[%65, %62], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%81 = vector.insert_strided_slice %80, %79 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%82 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%83 = vector.transfer_read %subview[%41, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%84 = vector.insert_strided_slice %83, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%85 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%86 = vector.transfer_read %subview[%41, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%87 = vector.insert_strided_slice %86, %84 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%88 = vector.transfer_read %subview[%44, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%89 = vector.insert_strided_slice %88, %87 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%90 = vector.transfer_read %subview[%44, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%91 = vector.insert_strided_slice %90, %89 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%92 = vector.transfer_read %subview[%47, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%93 = vector.insert_strided_slice %92, %91 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%94 = vector.transfer_read %subview[%47, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%95 = vector.insert_strided_slice %94, %93 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%96 = vector.transfer_read %subview[%50, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%97 = vector.insert_strided_slice %96, %95 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%98 = vector.transfer_read %subview[%50, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%99 = vector.insert_strided_slice %98, %97 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%100 = vector.transfer_read %subview[%53, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%101 = vector.insert_strided_slice %100, %99 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%102 = vector.transfer_read %subview[%53, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%103 = vector.insert_strided_slice %102, %101 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%104 = vector.transfer_read %subview[%56, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%105 = vector.insert_strided_slice %104, %103 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%106 = vector.transfer_read %subview[%56, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%107 = vector.insert_strided_slice %106, %105 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%108 = vector.transfer_read %subview[%59, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%109 = vector.insert_strided_slice %108, %107 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%110 = vector.transfer_read %subview[%59, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%111 = vector.insert_strided_slice %110, %109 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%112 = vector.transfer_read %subview[%62, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%113 = vector.insert_strided_slice %112, %111 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%114 = vector.transfer_read %subview[%62, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%115 = vector.insert_strided_slice %114, %113 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%116 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%117 = vector.extract %81[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%118 = vector.extract %115[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%119 = vector.shape_cast %117 : vector<1x1x1x4xf16> to vector<4xf16>
%120 = vector.shape_cast %118 : vector<1x1x4x1xf16> to vector<4xf16>
%121 = vector.shape_cast %116 : vector<1x1x4x1xf32> to vector<4xf32>
%122 = amdgpu.mfma %119 * %120 + %121 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%123 = vector.extract %81[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%124 = vector.extract %115[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%125 = vector.shape_cast %123 : vector<1x1x1x4xf16> to vector<4xf16>
%126 = vector.shape_cast %124 : vector<1x1x4x1xf16> to vector<4xf16>
%127 = amdgpu.mfma %125 * %126 + %122 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%128 = vector.extract %81[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%129 = vector.extract %115[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%130 = vector.shape_cast %128 : vector<1x1x1x4xf16> to vector<4xf16>
%131 = vector.shape_cast %129 : vector<1x1x4x1xf16> to vector<4xf16>
%132 = amdgpu.mfma %130 * %131 + %127 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%133 = vector.extract %81[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%134 = vector.extract %115[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%135 = vector.shape_cast %133 : vector<1x1x1x4xf16> to vector<4xf16>
%136 = vector.shape_cast %134 : vector<1x1x4x1xf16> to vector<4xf16>
%137 = amdgpu.mfma %135 * %136 + %132 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%138 = vector.extract %81[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%139 = vector.extract %115[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%140 = vector.shape_cast %138 : vector<1x1x1x4xf16> to vector<4xf16>
%141 = vector.shape_cast %139 : vector<1x1x4x1xf16> to vector<4xf16>
%142 = amdgpu.mfma %140 * %141 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%143 = vector.extract %81[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%144 = vector.extract %115[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%145 = vector.shape_cast %143 : vector<1x1x1x4xf16> to vector<4xf16>
%146 = vector.shape_cast %144 : vector<1x1x4x1xf16> to vector<4xf16>
%147 = amdgpu.mfma %145 * %146 + %142 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%148 = vector.extract %81[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%149 = vector.extract %115[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%150 = vector.shape_cast %148 : vector<1x1x1x4xf16> to vector<4xf16>
%151 = vector.shape_cast %149 : vector<1x1x4x1xf16> to vector<4xf16>
%152 = amdgpu.mfma %150 * %151 + %147 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%153 = vector.extract %81[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%154 = vector.extract %115[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%155 = vector.shape_cast %153 : vector<1x1x1x4xf16> to vector<4xf16>
%156 = vector.shape_cast %154 : vector<1x1x4x1xf16> to vector<4xf16>
%157 = amdgpu.mfma %155 * %156 + %152 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%158 = vector.shape_cast %157 : vector<4xf32> to vector<1x1x4x1xf32>
%159 = vector.insert %158, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%160 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%161 = vector.extract %115[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%162 = vector.shape_cast %161 : vector<1x1x4x1xf16> to vector<4xf16>
%163 = vector.shape_cast %160 : vector<1x1x4x1xf32> to vector<4xf32>
%164 = amdgpu.mfma %119 * %162 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%165 = vector.extract %115[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%166 = vector.shape_cast %165 : vector<1x1x4x1xf16> to vector<4xf16>
%167 = amdgpu.mfma %125 * %166 + %164 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%168 = vector.extract %115[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%169 = vector.shape_cast %168 : vector<1x1x4x1xf16> to vector<4xf16>
%170 = amdgpu.mfma %130 * %169 + %167 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%171 = vector.extract %115[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%172 = vector.shape_cast %171 : vector<1x1x4x1xf16> to vector<4xf16>
%173 = amdgpu.mfma %135 * %172 + %170 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%174 = vector.extract %115[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%175 = vector.shape_cast %174 : vector<1x1x4x1xf16> to vector<4xf16>
%176 = amdgpu.mfma %140 * %175 + %173 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%177 = vector.extract %115[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%178 = vector.shape_cast %177 : vector<1x1x4x1xf16> to vector<4xf16>
%179 = amdgpu.mfma %145 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%180 = vector.extract %115[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%181 = vector.shape_cast %180 : vector<1x1x4x1xf16> to vector<4xf16>
%182 = amdgpu.mfma %150 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%183 = vector.extract %115[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%184 = vector.shape_cast %183 : vector<1x1x4x1xf16> to vector<4xf16>
%185 = amdgpu.mfma %155 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x4x1xf32>
%187 = vector.insert %186, %159 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%188 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%189 = vector.extract %81[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%190 = vector.shape_cast %189 : vector<1x1x1x4xf16> to vector<4xf16>
%191 = vector.shape_cast %188 : vector<1x1x4x1xf32> to vector<4xf32>
%192 = amdgpu.mfma %190 * %120 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%193 = vector.extract %81[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%194 = vector.shape_cast %193 : vector<1x1x1x4xf16> to vector<4xf16>
%195 = amdgpu.mfma %194 * %126 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%196 = vector.extract %81[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%197 = vector.shape_cast %196 : vector<1x1x1x4xf16> to vector<4xf16>
%198 = amdgpu.mfma %197 * %131 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%199 = vector.extract %81[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%200 = vector.shape_cast %199 : vector<1x1x1x4xf16> to vector<4xf16>
%201 = amdgpu.mfma %200 * %136 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%202 = vector.extract %81[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%203 = vector.shape_cast %202 : vector<1x1x1x4xf16> to vector<4xf16>
%204 = amdgpu.mfma %203 * %141 + %201 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%205 = vector.extract %81[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%206 = vector.shape_cast %205 : vector<1x1x1x4xf16> to vector<4xf16>
%207 = amdgpu.mfma %206 * %146 + %204 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%208 = vector.extract %81[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%209 = vector.shape_cast %208 : vector<1x1x1x4xf16> to vector<4xf16>
%210 = amdgpu.mfma %209 * %151 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%211 = vector.extract %81[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%212 = vector.shape_cast %211 : vector<1x1x1x4xf16> to vector<4xf16>
%213 = amdgpu.mfma %212 * %156 + %210 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%214 = vector.shape_cast %213 : vector<4xf32> to vector<1x1x4x1xf32>
%215 = vector.insert %214, %187 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%216 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%217 = vector.shape_cast %216 : vector<1x1x4x1xf32> to vector<4xf32>
%218 = amdgpu.mfma %190 * %162 + %217 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%219 = amdgpu.mfma %194 * %166 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%220 = amdgpu.mfma %197 * %169 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%221 = amdgpu.mfma %200 * %172 + %220 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%222 = amdgpu.mfma %203 * %175 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%223 = amdgpu.mfma %206 * %178 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%224 = amdgpu.mfma %209 * %181 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%225 = amdgpu.mfma %212 * %184 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x4x1xf32>
%227 = vector.insert %226, %215 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
scf.yield %227 : vector<2x2x1x1x4x1xf32>
}
%4 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%6 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %6, %subview_5[%4, %5] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%8 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %8, %subview_5[%4, %7] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%10 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %10, %subview_5[%9, %5] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%11 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %11, %subview_5[%9, %7] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.dealloc %subview_4 : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
memref.dealloc %subview : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
return
}
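// NOTE (annotation, not compiler output): in the loop body above, each
// `amdgpu.mfma ... {blocks = 1, k = 16, m = 16, n = 16}` is one wavefront-level
// 16x16x16 matrix fused multiply-add (f16 operands accumulated into f32), where each
// lane supplies a vector<4xf16> fragment per operand and holds a vector<4xf32> slice of
// the 16x16 accumulator. Each reduction step of the enclosing scf.for covers 128
// elements of K while every mfma consumes k = 16, so eight mfmas are chained per
// accumulator tile, threading the vector<4xf32> accumulator through the chain before it
// is re-inserted into the vector<2x2x1x1x4x1xf32> iter_arg. A minimal sketch of a single
// K-step, with hypothetical SSA names and the shapes copied from the dump:
//   %a        = vector.shape_cast %a_frag : vector<1x1x1x4xf16> to vector<4xf16>
//   %b        = vector.shape_cast %b_frag : vector<1x1x4x1xf16> to vector<4xf16>
//   %acc_next = amdgpu.mfma %a * %b + %acc {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>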
// -----// IR Dump After LLVMGPUPrefetchSharedMemoryPass (iree-llvmgpu-prefetch-shared-memory) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16>
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32>
%cst_2 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%alloc = memref.alloc() : memref<128x68xf16, #gpu.address_space<workgroup>>
%subview = memref.subview %alloc[0, 0] [128, 64] [1, 1] : memref<128x68xf16, #gpu.address_space<workgroup>> to memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<64x132xf16, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc_3[0, 0] [64, 128] [1, 1] : memref<64x132xf16, #gpu.address_space<workgroup>> to memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%subview_5 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) {
gpu.barrier
%12 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x]
%13 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x]
%14 = vector.transfer_read %0[%12, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x]
%16 = vector.transfer_read %0[%15, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x]
%18 = vector.transfer_read %0[%17, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x]
%20 = vector.transfer_read %0[%19, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x]
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x]
%23 = vector.transfer_read %1[%21, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x]
%25 = vector.transfer_read %1[%24, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x]
%27 = vector.transfer_read %1[%26, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x]
%29 = vector.transfer_read %1[%28, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x]
%31 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x]
vector.transfer_write %14, %subview_4[%30, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
%32 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x]
vector.transfer_write %16, %subview_4[%32, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
%33 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x]
vector.transfer_write %18, %subview_4[%33, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x]
vector.transfer_write %20, %subview_4[%34, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x]
%36 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
vector.transfer_write %23, %subview[%35, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
%37 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x]
vector.transfer_write %25, %subview[%37, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x]
vector.transfer_write %27, %subview[%38, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
%39 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x]
vector.transfer_write %29, %subview[%39, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
gpu.barrier
%40 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%42 = vector.transfer_read %subview_4[%40, %41], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%43 = vector.insert_strided_slice %42, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%45 = vector.transfer_read %subview_4[%40, %44], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%46 = vector.insert_strided_slice %45, %43 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%47 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x]
%48 = vector.transfer_read %subview_4[%40, %47], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%49 = vector.insert_strided_slice %48, %46 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%50 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x]
%51 = vector.transfer_read %subview_4[%40, %50], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%52 = vector.insert_strided_slice %51, %49 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%53 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x]
%54 = vector.transfer_read %subview_4[%40, %53], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%55 = vector.insert_strided_slice %54, %52 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%56 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x]
%57 = vector.transfer_read %subview_4[%40, %56], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%58 = vector.insert_strided_slice %57, %55 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%59 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x]
%60 = vector.transfer_read %subview_4[%40, %59], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%61 = vector.insert_strided_slice %60, %58 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%62 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x]
%63 = vector.transfer_read %subview_4[%40, %62], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%64 = vector.insert_strided_slice %63, %61 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%65 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%66 = vector.transfer_read %subview_4[%65, %41], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%67 = vector.insert_strided_slice %66, %64 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%68 = vector.transfer_read %subview_4[%65, %44], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%69 = vector.insert_strided_slice %68, %67 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%70 = vector.transfer_read %subview_4[%65, %47], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%71 = vector.insert_strided_slice %70, %69 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%72 = vector.transfer_read %subview_4[%65, %50], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%73 = vector.insert_strided_slice %72, %71 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%74 = vector.transfer_read %subview_4[%65, %53], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%75 = vector.insert_strided_slice %74, %73 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%76 = vector.transfer_read %subview_4[%65, %56], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%77 = vector.insert_strided_slice %76, %75 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%78 = vector.transfer_read %subview_4[%65, %59], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%79 = vector.insert_strided_slice %78, %77 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%80 = vector.transfer_read %subview_4[%65, %62], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16>
%81 = vector.insert_strided_slice %80, %79 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%82 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%83 = vector.transfer_read %subview[%41, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%84 = vector.insert_strided_slice %83, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%85 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%86 = vector.transfer_read %subview[%41, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%87 = vector.insert_strided_slice %86, %84 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%88 = vector.transfer_read %subview[%44, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%89 = vector.insert_strided_slice %88, %87 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%90 = vector.transfer_read %subview[%44, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%91 = vector.insert_strided_slice %90, %89 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%92 = vector.transfer_read %subview[%47, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%93 = vector.insert_strided_slice %92, %91 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%94 = vector.transfer_read %subview[%47, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%95 = vector.insert_strided_slice %94, %93 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%96 = vector.transfer_read %subview[%50, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%97 = vector.insert_strided_slice %96, %95 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%98 = vector.transfer_read %subview[%50, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%99 = vector.insert_strided_slice %98, %97 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%100 = vector.transfer_read %subview[%53, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%101 = vector.insert_strided_slice %100, %99 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%102 = vector.transfer_read %subview[%53, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%103 = vector.insert_strided_slice %102, %101 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%104 = vector.transfer_read %subview[%56, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%105 = vector.insert_strided_slice %104, %103 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%106 = vector.transfer_read %subview[%56, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%107 = vector.insert_strided_slice %106, %105 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%108 = vector.transfer_read %subview[%59, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%109 = vector.insert_strided_slice %108, %107 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%110 = vector.transfer_read %subview[%59, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%111 = vector.insert_strided_slice %110, %109 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%112 = vector.transfer_read %subview[%62, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%113 = vector.insert_strided_slice %112, %111 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%114 = vector.transfer_read %subview[%62, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16>
%115 = vector.insert_strided_slice %114, %113 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%116 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%117 = vector.extract %81[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%118 = vector.extract %115[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%119 = vector.shape_cast %117 : vector<1x1x1x4xf16> to vector<4xf16>
%120 = vector.shape_cast %118 : vector<1x1x4x1xf16> to vector<4xf16>
%121 = vector.shape_cast %116 : vector<1x1x4x1xf32> to vector<4xf32>
%122 = amdgpu.mfma %119 * %120 + %121 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%123 = vector.extract %81[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%124 = vector.extract %115[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%125 = vector.shape_cast %123 : vector<1x1x1x4xf16> to vector<4xf16>
%126 = vector.shape_cast %124 : vector<1x1x4x1xf16> to vector<4xf16>
%127 = amdgpu.mfma %125 * %126 + %122 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%128 = vector.extract %81[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%129 = vector.extract %115[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%130 = vector.shape_cast %128 : vector<1x1x1x4xf16> to vector<4xf16>
%131 = vector.shape_cast %129 : vector<1x1x4x1xf16> to vector<4xf16>
%132 = amdgpu.mfma %130 * %131 + %127 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%133 = vector.extract %81[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%134 = vector.extract %115[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%135 = vector.shape_cast %133 : vector<1x1x1x4xf16> to vector<4xf16>
%136 = vector.shape_cast %134 : vector<1x1x4x1xf16> to vector<4xf16>
%137 = amdgpu.mfma %135 * %136 + %132 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%138 = vector.extract %81[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%139 = vector.extract %115[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%140 = vector.shape_cast %138 : vector<1x1x1x4xf16> to vector<4xf16>
%141 = vector.shape_cast %139 : vector<1x1x4x1xf16> to vector<4xf16>
%142 = amdgpu.mfma %140 * %141 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%143 = vector.extract %81[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%144 = vector.extract %115[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%145 = vector.shape_cast %143 : vector<1x1x1x4xf16> to vector<4xf16>
%146 = vector.shape_cast %144 : vector<1x1x4x1xf16> to vector<4xf16>
%147 = amdgpu.mfma %145 * %146 + %142 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%148 = vector.extract %81[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%149 = vector.extract %115[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%150 = vector.shape_cast %148 : vector<1x1x1x4xf16> to vector<4xf16>
%151 = vector.shape_cast %149 : vector<1x1x4x1xf16> to vector<4xf16>
%152 = amdgpu.mfma %150 * %151 + %147 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%153 = vector.extract %81[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%154 = vector.extract %115[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%155 = vector.shape_cast %153 : vector<1x1x1x4xf16> to vector<4xf16>
%156 = vector.shape_cast %154 : vector<1x1x4x1xf16> to vector<4xf16>
%157 = amdgpu.mfma %155 * %156 + %152 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%158 = vector.shape_cast %157 : vector<4xf32> to vector<1x1x4x1xf32>
%159 = vector.insert %158, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%160 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%161 = vector.extract %115[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%162 = vector.shape_cast %161 : vector<1x1x4x1xf16> to vector<4xf16>
%163 = vector.shape_cast %160 : vector<1x1x4x1xf32> to vector<4xf32>
%164 = amdgpu.mfma %119 * %162 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%165 = vector.extract %115[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%166 = vector.shape_cast %165 : vector<1x1x4x1xf16> to vector<4xf16>
%167 = amdgpu.mfma %125 * %166 + %164 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%168 = vector.extract %115[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%169 = vector.shape_cast %168 : vector<1x1x4x1xf16> to vector<4xf16>
%170 = amdgpu.mfma %130 * %169 + %167 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%171 = vector.extract %115[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%172 = vector.shape_cast %171 : vector<1x1x4x1xf16> to vector<4xf16>
%173 = amdgpu.mfma %135 * %172 + %170 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%174 = vector.extract %115[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%175 = vector.shape_cast %174 : vector<1x1x4x1xf16> to vector<4xf16>
%176 = amdgpu.mfma %140 * %175 + %173 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%177 = vector.extract %115[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%178 = vector.shape_cast %177 : vector<1x1x4x1xf16> to vector<4xf16>
%179 = amdgpu.mfma %145 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%180 = vector.extract %115[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%181 = vector.shape_cast %180 : vector<1x1x4x1xf16> to vector<4xf16>
%182 = amdgpu.mfma %150 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%183 = vector.extract %115[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%184 = vector.shape_cast %183 : vector<1x1x4x1xf16> to vector<4xf16>
%185 = amdgpu.mfma %155 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x4x1xf32>
%187 = vector.insert %186, %159 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%188 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%189 = vector.extract %81[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%190 = vector.shape_cast %189 : vector<1x1x1x4xf16> to vector<4xf16>
%191 = vector.shape_cast %188 : vector<1x1x4x1xf32> to vector<4xf32>
%192 = amdgpu.mfma %190 * %120 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%193 = vector.extract %81[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%194 = vector.shape_cast %193 : vector<1x1x1x4xf16> to vector<4xf16>
%195 = amdgpu.mfma %194 * %126 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%196 = vector.extract %81[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%197 = vector.shape_cast %196 : vector<1x1x1x4xf16> to vector<4xf16>
%198 = amdgpu.mfma %197 * %131 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%199 = vector.extract %81[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%200 = vector.shape_cast %199 : vector<1x1x1x4xf16> to vector<4xf16>
%201 = amdgpu.mfma %200 * %136 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%202 = vector.extract %81[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%203 = vector.shape_cast %202 : vector<1x1x1x4xf16> to vector<4xf16>
%204 = amdgpu.mfma %203 * %141 + %201 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%205 = vector.extract %81[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%206 = vector.shape_cast %205 : vector<1x1x1x4xf16> to vector<4xf16>
%207 = amdgpu.mfma %206 * %146 + %204 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%208 = vector.extract %81[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%209 = vector.shape_cast %208 : vector<1x1x1x4xf16> to vector<4xf16>
%210 = amdgpu.mfma %209 * %151 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%211 = vector.extract %81[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%212 = vector.shape_cast %211 : vector<1x1x1x4xf16> to vector<4xf16>
%213 = amdgpu.mfma %212 * %156 + %210 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%214 = vector.shape_cast %213 : vector<4xf32> to vector<1x1x4x1xf32>
%215 = vector.insert %214, %187 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%216 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%217 = vector.shape_cast %216 : vector<1x1x4x1xf32> to vector<4xf32>
%218 = amdgpu.mfma %190 * %162 + %217 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%219 = amdgpu.mfma %194 * %166 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%220 = amdgpu.mfma %197 * %169 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%221 = amdgpu.mfma %200 * %172 + %220 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%222 = amdgpu.mfma %203 * %175 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%223 = amdgpu.mfma %206 * %178 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%224 = amdgpu.mfma %209 * %181 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%225 = amdgpu.mfma %212 * %184 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x4x1xf32>
%227 = vector.insert %226, %215 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
scf.yield %227 : vector<2x2x1x1x4x1xf32>
}
%4 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%6 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %6, %subview_5[%4, %5] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%8 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %8, %subview_5[%4, %7] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%10 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %10, %subview_5[%9, %5] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%11 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %11, %subview_5[%9, %7] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.dealloc %subview_4 : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
memref.dealloc %subview : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
return
}
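// NOTE (annotation, not compiler output): fold-memref-alias-ops rewrites accesses that go
// through memref.subview so they operate on the underlying allocation directly, composing
// the subview offsets into the access indices. In the dump below, the transfer_read /
// transfer_write ops on %subview and %subview_4 (strided views of the padded workgroup
// buffers) become ops on %alloc and %alloc_3 themselves; since both subviews start at
// [0, 0], the indices are unchanged and only the memref type changes. A minimal sketch of
// the rewrite, with hypothetical SSA names and shapes taken from the buffers above:
//
// before:
//   %sub = memref.subview %alloc[0, 0] [64, 128] [1, 1] : memref<64x132xf16, #gpu.address_space<workgroup>> to memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
//   vector.transfer_write %v, %sub[%i, %j] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
// after:
//   vector.transfer_write %v, %alloc[%i, %j] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>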
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16>
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32>
%cst_2 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%alloc = memref.alloc() : memref<128x68xf16, #gpu.address_space<workgroup>>
%subview = memref.subview %alloc[0, 0] [128, 64] [1, 1] : memref<128x68xf16, #gpu.address_space<workgroup>> to memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<64x132xf16, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc_3[0, 0] [64, 128] [1, 1] : memref<64x132xf16, #gpu.address_space<workgroup>> to memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) {
gpu.barrier
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x]
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x]
%18 = vector.transfer_read %0[%16, %17], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x]
%20 = vector.transfer_read %0[%19, %17], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x]
%22 = vector.transfer_read %0[%21, %17], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x]
%24 = vector.transfer_read %0[%23, %17], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%25 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x]
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x]
%27 = vector.transfer_read %1[%25, %26], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x]
%29 = vector.transfer_read %1[%28, %26], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x]
%31 = vector.transfer_read %1[%30, %26], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%32 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x]
%33 = vector.transfer_read %1[%32, %26], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x]
%35 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x]
vector.transfer_write %18, %alloc_3[%34, %35] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%36 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x]
%37 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x]
vector.transfer_write %20, %alloc_3[%36, %37] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x]
%39 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x]
vector.transfer_write %22, %alloc_3[%38, %39] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%40 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x]
%41 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x]
vector.transfer_write %24, %alloc_3[%40, %41] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%42 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x]
%43 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
vector.transfer_write %27, %alloc[%42, %43] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x]
%45 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
vector.transfer_write %29, %alloc[%44, %45] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
%46 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x]
%47 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
vector.transfer_write %31, %alloc[%46, %47] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
%48 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x]
%49 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
vector.transfer_write %33, %alloc[%48, %49] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
gpu.barrier
%50 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%51 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%52 = vector.transfer_read %alloc_3[%50, %51], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%53 = vector.insert_strided_slice %52, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%54 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%55 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%56 = vector.transfer_read %alloc_3[%54, %55], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%57 = vector.insert_strided_slice %56, %53 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%58 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%59 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x]
%60 = vector.transfer_read %alloc_3[%58, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%61 = vector.insert_strided_slice %60, %57 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%62 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%63 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x]
%64 = vector.transfer_read %alloc_3[%62, %63], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%65 = vector.insert_strided_slice %64, %61 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%66 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%67 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x]
%68 = vector.transfer_read %alloc_3[%66, %67], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%69 = vector.insert_strided_slice %68, %65 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%70 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%71 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x]
%72 = vector.transfer_read %alloc_3[%70, %71], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%73 = vector.insert_strided_slice %72, %69 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%74 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%75 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x]
%76 = vector.transfer_read %alloc_3[%74, %75], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%77 = vector.insert_strided_slice %76, %73 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%78 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%79 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x]
%80 = vector.transfer_read %alloc_3[%78, %79], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%81 = vector.insert_strided_slice %80, %77 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%82 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%83 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%84 = vector.transfer_read %alloc_3[%82, %83], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%85 = vector.insert_strided_slice %84, %81 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%86 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%87 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%88 = vector.transfer_read %alloc_3[%86, %87], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%89 = vector.insert_strided_slice %88, %85 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%90 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%91 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x]
%92 = vector.transfer_read %alloc_3[%90, %91], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%93 = vector.insert_strided_slice %92, %89 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%94 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%95 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x]
%96 = vector.transfer_read %alloc_3[%94, %95], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%97 = vector.insert_strided_slice %96, %93 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%98 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%99 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x]
%100 = vector.transfer_read %alloc_3[%98, %99], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%101 = vector.insert_strided_slice %100, %97 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%102 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%103 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x]
%104 = vector.transfer_read %alloc_3[%102, %103], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%105 = vector.insert_strided_slice %104, %101 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%106 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%107 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x]
%108 = vector.transfer_read %alloc_3[%106, %107], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%109 = vector.insert_strided_slice %108, %105 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%110 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%111 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x]
%112 = vector.transfer_read %alloc_3[%110, %111], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%113 = vector.insert_strided_slice %112, %109 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%114 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%115 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%116 = vector.transfer_read %alloc[%114, %115], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%117 = vector.insert_strided_slice %116, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%118 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%119 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%120 = vector.transfer_read %alloc[%118, %119], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%121 = vector.insert_strided_slice %120, %117 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%122 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%123 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%124 = vector.transfer_read %alloc[%122, %123], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%125 = vector.insert_strided_slice %124, %121 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%126 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%127 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%128 = vector.transfer_read %alloc[%126, %127], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%129 = vector.insert_strided_slice %128, %125 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%130 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x]
%131 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%132 = vector.transfer_read %alloc[%130, %131], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%133 = vector.insert_strided_slice %132, %129 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%134 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x]
%135 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%136 = vector.transfer_read %alloc[%134, %135], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%137 = vector.insert_strided_slice %136, %133 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%138 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x]
%139 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%140 = vector.transfer_read %alloc[%138, %139], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%141 = vector.insert_strided_slice %140, %137 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%142 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x]
%143 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%144 = vector.transfer_read %alloc[%142, %143], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%145 = vector.insert_strided_slice %144, %141 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%146 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x]
%147 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%148 = vector.transfer_read %alloc[%146, %147], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%149 = vector.insert_strided_slice %148, %145 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%150 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x]
%151 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%152 = vector.transfer_read %alloc[%150, %151], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%153 = vector.insert_strided_slice %152, %149 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%154 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x]
%155 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%156 = vector.transfer_read %alloc[%154, %155], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%157 = vector.insert_strided_slice %156, %153 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%158 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x]
%159 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%160 = vector.transfer_read %alloc[%158, %159], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%161 = vector.insert_strided_slice %160, %157 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%162 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x]
%163 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%164 = vector.transfer_read %alloc[%162, %163], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%165 = vector.insert_strided_slice %164, %161 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%166 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x]
%167 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%168 = vector.transfer_read %alloc[%166, %167], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%169 = vector.insert_strided_slice %168, %165 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%170 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x]
%171 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%172 = vector.transfer_read %alloc[%170, %171], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%173 = vector.insert_strided_slice %172, %169 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%174 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x]
%175 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%176 = vector.transfer_read %alloc[%174, %175], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%177 = vector.insert_strided_slice %176, %173 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%178 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%179 = vector.extract %113[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%180 = vector.extract %177[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%181 = vector.shape_cast %179 : vector<1x1x1x4xf16> to vector<4xf16>
%182 = vector.shape_cast %180 : vector<1x1x4x1xf16> to vector<4xf16>
%183 = vector.shape_cast %178 : vector<1x1x4x1xf32> to vector<4xf32>
%184 = amdgpu.mfma %181 * %182 + %183 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%185 = vector.extract %113[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%186 = vector.extract %177[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%187 = vector.shape_cast %185 : vector<1x1x1x4xf16> to vector<4xf16>
%188 = vector.shape_cast %186 : vector<1x1x4x1xf16> to vector<4xf16>
%189 = amdgpu.mfma %187 * %188 + %184 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%190 = vector.extract %113[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%191 = vector.extract %177[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%192 = vector.shape_cast %190 : vector<1x1x1x4xf16> to vector<4xf16>
%193 = vector.shape_cast %191 : vector<1x1x4x1xf16> to vector<4xf16>
%194 = amdgpu.mfma %192 * %193 + %189 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%195 = vector.extract %113[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%196 = vector.extract %177[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%197 = vector.shape_cast %195 : vector<1x1x1x4xf16> to vector<4xf16>
%198 = vector.shape_cast %196 : vector<1x1x4x1xf16> to vector<4xf16>
%199 = amdgpu.mfma %197 * %198 + %194 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%200 = vector.extract %113[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%201 = vector.extract %177[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%202 = vector.shape_cast %200 : vector<1x1x1x4xf16> to vector<4xf16>
%203 = vector.shape_cast %201 : vector<1x1x4x1xf16> to vector<4xf16>
%204 = amdgpu.mfma %202 * %203 + %199 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%205 = vector.extract %113[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%206 = vector.extract %177[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%207 = vector.shape_cast %205 : vector<1x1x1x4xf16> to vector<4xf16>
%208 = vector.shape_cast %206 : vector<1x1x4x1xf16> to vector<4xf16>
%209 = amdgpu.mfma %207 * %208 + %204 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%210 = vector.extract %113[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%211 = vector.extract %177[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%212 = vector.shape_cast %210 : vector<1x1x1x4xf16> to vector<4xf16>
%213 = vector.shape_cast %211 : vector<1x1x4x1xf16> to vector<4xf16>
%214 = amdgpu.mfma %212 * %213 + %209 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%215 = vector.extract %113[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%216 = vector.extract %177[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%217 = vector.shape_cast %215 : vector<1x1x1x4xf16> to vector<4xf16>
%218 = vector.shape_cast %216 : vector<1x1x4x1xf16> to vector<4xf16>
%219 = amdgpu.mfma %217 * %218 + %214 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%220 = vector.shape_cast %219 : vector<4xf32> to vector<1x1x4x1xf32>
%221 = vector.insert %220, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%222 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%223 = vector.extract %177[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%224 = vector.shape_cast %223 : vector<1x1x4x1xf16> to vector<4xf16>
%225 = vector.shape_cast %222 : vector<1x1x4x1xf32> to vector<4xf32>
%226 = amdgpu.mfma %181 * %224 + %225 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%227 = vector.extract %177[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%228 = vector.shape_cast %227 : vector<1x1x4x1xf16> to vector<4xf16>
%229 = amdgpu.mfma %187 * %228 + %226 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%230 = vector.extract %177[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%231 = vector.shape_cast %230 : vector<1x1x4x1xf16> to vector<4xf16>
%232 = amdgpu.mfma %192 * %231 + %229 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%233 = vector.extract %177[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%234 = vector.shape_cast %233 : vector<1x1x4x1xf16> to vector<4xf16>
%235 = amdgpu.mfma %197 * %234 + %232 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%236 = vector.extract %177[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%237 = vector.shape_cast %236 : vector<1x1x4x1xf16> to vector<4xf16>
%238 = amdgpu.mfma %202 * %237 + %235 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%239 = vector.extract %177[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%240 = vector.shape_cast %239 : vector<1x1x4x1xf16> to vector<4xf16>
%241 = amdgpu.mfma %207 * %240 + %238 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%242 = vector.extract %177[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%243 = vector.shape_cast %242 : vector<1x1x4x1xf16> to vector<4xf16>
%244 = amdgpu.mfma %212 * %243 + %241 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%245 = vector.extract %177[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%246 = vector.shape_cast %245 : vector<1x1x4x1xf16> to vector<4xf16>
%247 = amdgpu.mfma %217 * %246 + %244 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%248 = vector.shape_cast %247 : vector<4xf32> to vector<1x1x4x1xf32>
%249 = vector.insert %248, %221 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%250 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%251 = vector.extract %113[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%252 = vector.shape_cast %251 : vector<1x1x1x4xf16> to vector<4xf16>
%253 = vector.shape_cast %250 : vector<1x1x4x1xf32> to vector<4xf32>
%254 = amdgpu.mfma %252 * %182 + %253 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%255 = vector.extract %113[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%256 = vector.shape_cast %255 : vector<1x1x1x4xf16> to vector<4xf16>
%257 = amdgpu.mfma %256 * %188 + %254 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%258 = vector.extract %113[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%259 = vector.shape_cast %258 : vector<1x1x1x4xf16> to vector<4xf16>
%260 = amdgpu.mfma %259 * %193 + %257 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%261 = vector.extract %113[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%262 = vector.shape_cast %261 : vector<1x1x1x4xf16> to vector<4xf16>
%263 = amdgpu.mfma %262 * %198 + %260 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%264 = vector.extract %113[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%265 = vector.shape_cast %264 : vector<1x1x1x4xf16> to vector<4xf16>
%266 = amdgpu.mfma %265 * %203 + %263 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%267 = vector.extract %113[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%268 = vector.shape_cast %267 : vector<1x1x1x4xf16> to vector<4xf16>
%269 = amdgpu.mfma %268 * %208 + %266 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%270 = vector.extract %113[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%271 = vector.shape_cast %270 : vector<1x1x1x4xf16> to vector<4xf16>
%272 = amdgpu.mfma %271 * %213 + %269 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%273 = vector.extract %113[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%274 = vector.shape_cast %273 : vector<1x1x1x4xf16> to vector<4xf16>
%275 = amdgpu.mfma %274 * %218 + %272 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%276 = vector.shape_cast %275 : vector<4xf32> to vector<1x1x4x1xf32>
%277 = vector.insert %276, %249 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%278 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%279 = vector.shape_cast %278 : vector<1x1x4x1xf32> to vector<4xf32>
%280 = amdgpu.mfma %252 * %224 + %279 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%281 = amdgpu.mfma %256 * %228 + %280 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%282 = amdgpu.mfma %259 * %231 + %281 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%283 = amdgpu.mfma %262 * %234 + %282 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%284 = amdgpu.mfma %265 * %237 + %283 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%285 = amdgpu.mfma %268 * %240 + %284 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%286 = amdgpu.mfma %271 * %243 + %285 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%287 = amdgpu.mfma %274 * %246 + %286 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%288 = vector.shape_cast %287 : vector<4xf32> to vector<1x1x4x1xf32>
%289 = vector.insert %288, %277 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
scf.yield %289 : vector<2x2x1x1x4x1xf32>
}
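// The vector<2x2x1x1x4x1xf32> yielded by the K loop (%3) is the per-thread accumulator; the four
// vector.extract / vector.transfer_write pairs below store its 4x1 f32 fragments into the
// 256x256 output buffer %2.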
%4 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
%5 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16)>()[%arg0, %thread_id_x]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16)>()[%arg1, %thread_id_x]
vector.transfer_write %4, %2[%5, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
%7 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
%8 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16)>()[%arg0, %thread_id_x]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16 + 16)>()[%arg1, %thread_id_x]
vector.transfer_write %7, %2[%8, %9] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
%10 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
%11 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16 + 16)>()[%arg0, %thread_id_x]
%12 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16)>()[%arg1, %thread_id_x]
vector.transfer_write %10, %2[%11, %12] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
%13 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
%14 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16 + 16)>()[%arg0, %thread_id_x]
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16 + 16)>()[%arg1, %thread_id_x]
vector.transfer_write %13, %2[%14, %15] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.dealloc %subview_4 : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
memref.dealloc %subview : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After CSE (cse) //----- //
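// CSE merges the duplicated affine.apply index computations from the previous dump: each
// workgroup-memory row/column offset (e.g. %41, %44, ..., %62 and %82, %85 below) is now computed
// once per iteration and reused by all transfer_reads, and the duplicated store-index maps in the
// epilogue collapse to the single %5/%6/%8/%10 set.
// Worked example of the A-tile staging indices (illustrative arithmetic only): for %thread_id_x = 70,
// row = %arg0 + (70 floordiv 16) mod 16 = %arg0 + 4 and col = %arg2 + (70 mod 16) * 8 = %arg2 + 48,
// so that thread copies A[%arg0 + 4, %arg2 + 48 .. 55] as a vector<1x8xf16> into shared row 4,
// columns 48..55 of %alloc_3.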
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16>
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32>
%cst_2 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%alloc = memref.alloc() : memref<128x68xf16, #gpu.address_space<workgroup>>
%subview = memref.subview %alloc[0, 0] [128, 64] [1, 1] : memref<128x68xf16, #gpu.address_space<workgroup>> to memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<64x132xf16, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc_3[0, 0] [64, 128] [1, 1] : memref<64x132xf16, #gpu.address_space<workgroup>> to memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) {
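// K loop: %arg2 steps over the 256-wide reduction dimension in chunks of 128, carrying the
// vector<2x2x1x1x4x1xf32> accumulator (%arg3) across iterations; each iteration stages one A tile
// and one B tile through workgroup memory and issues 32 amdgpu.mfma ops.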
gpu.barrier
%12 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x]
%13 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x]
%14 = vector.transfer_read %0[%12, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x]
%16 = vector.transfer_read %0[%15, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x]
%18 = vector.transfer_read %0[%17, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x]
%20 = vector.transfer_read %0[%19, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x]
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x]
%23 = vector.transfer_read %1[%21, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x]
%25 = vector.transfer_read %1[%24, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x]
%27 = vector.transfer_read %1[%26, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x]
%29 = vector.transfer_read %1[%28, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x]
%31 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x]
vector.transfer_write %14, %alloc_3[%30, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%32 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x]
vector.transfer_write %16, %alloc_3[%32, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%33 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x]
vector.transfer_write %18, %alloc_3[%33, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x]
vector.transfer_write %20, %alloc_3[%34, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x]
%36 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
vector.transfer_write %23, %alloc[%35, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
%37 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x]
vector.transfer_write %25, %alloc[%37, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x]
vector.transfer_write %27, %alloc[%38, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
%39 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x]
vector.transfer_write %29, %alloc[%39, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
gpu.barrier
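// The barrier above separates the global -> workgroup staging (vector<1x8xf16> copies of the
// 64x128 A tile into %alloc_3 and the 128x64 B tile into %alloc) from the workgroup -> register
// loads below, which gather vector<1x4xf16> A fragments and vector<4x1xf16> B fragments for the
// 16x16x16 amdgpu.mfma ops.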
%40 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%42 = vector.transfer_read %alloc_3[%40, %41], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%43 = vector.insert_strided_slice %42, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%45 = vector.transfer_read %alloc_3[%40, %44], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%46 = vector.insert_strided_slice %45, %43 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%47 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x]
%48 = vector.transfer_read %alloc_3[%40, %47], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%49 = vector.insert_strided_slice %48, %46 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%50 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x]
%51 = vector.transfer_read %alloc_3[%40, %50], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%52 = vector.insert_strided_slice %51, %49 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%53 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x]
%54 = vector.transfer_read %alloc_3[%40, %53], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%55 = vector.insert_strided_slice %54, %52 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%56 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x]
%57 = vector.transfer_read %alloc_3[%40, %56], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%58 = vector.insert_strided_slice %57, %55 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%59 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x]
%60 = vector.transfer_read %alloc_3[%40, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%61 = vector.insert_strided_slice %60, %58 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%62 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x]
%63 = vector.transfer_read %alloc_3[%40, %62], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%64 = vector.insert_strided_slice %63, %61 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%65 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%66 = vector.transfer_read %alloc_3[%65, %41], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%67 = vector.insert_strided_slice %66, %64 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%68 = vector.transfer_read %alloc_3[%65, %44], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%69 = vector.insert_strided_slice %68, %67 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%70 = vector.transfer_read %alloc_3[%65, %47], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%71 = vector.insert_strided_slice %70, %69 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%72 = vector.transfer_read %alloc_3[%65, %50], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%73 = vector.insert_strided_slice %72, %71 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%74 = vector.transfer_read %alloc_3[%65, %53], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%75 = vector.insert_strided_slice %74, %73 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%76 = vector.transfer_read %alloc_3[%65, %56], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%77 = vector.insert_strided_slice %76, %75 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%78 = vector.transfer_read %alloc_3[%65, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%79 = vector.insert_strided_slice %78, %77 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%80 = vector.transfer_read %alloc_3[%65, %62], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%81 = vector.insert_strided_slice %80, %79 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%82 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%83 = vector.transfer_read %alloc[%41, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%84 = vector.insert_strided_slice %83, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%85 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%86 = vector.transfer_read %alloc[%41, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%87 = vector.insert_strided_slice %86, %84 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%88 = vector.transfer_read %alloc[%44, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%89 = vector.insert_strided_slice %88, %87 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%90 = vector.transfer_read %alloc[%44, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%91 = vector.insert_strided_slice %90, %89 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%92 = vector.transfer_read %alloc[%47, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%93 = vector.insert_strided_slice %92, %91 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%94 = vector.transfer_read %alloc[%47, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%95 = vector.insert_strided_slice %94, %93 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%96 = vector.transfer_read %alloc[%50, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%97 = vector.insert_strided_slice %96, %95 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%98 = vector.transfer_read %alloc[%50, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%99 = vector.insert_strided_slice %98, %97 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%100 = vector.transfer_read %alloc[%53, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%101 = vector.insert_strided_slice %100, %99 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%102 = vector.transfer_read %alloc[%53, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%103 = vector.insert_strided_slice %102, %101 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%104 = vector.transfer_read %alloc[%56, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%105 = vector.insert_strided_slice %104, %103 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%106 = vector.transfer_read %alloc[%56, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%107 = vector.insert_strided_slice %106, %105 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%108 = vector.transfer_read %alloc[%59, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%109 = vector.insert_strided_slice %108, %107 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%110 = vector.transfer_read %alloc[%59, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%111 = vector.insert_strided_slice %110, %109 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%112 = vector.transfer_read %alloc[%62, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%113 = vector.insert_strided_slice %112, %111 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%114 = vector.transfer_read %alloc[%62, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%115 = vector.insert_strided_slice %114, %113 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%116 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%117 = vector.extract %81[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%118 = vector.extract %115[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%119 = vector.shape_cast %117 : vector<1x1x1x4xf16> to vector<4xf16>
%120 = vector.shape_cast %118 : vector<1x1x4x1xf16> to vector<4xf16>
%121 = vector.shape_cast %116 : vector<1x1x4x1xf32> to vector<4xf32>
%122 = amdgpu.mfma %119 * %120 + %121 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%123 = vector.extract %81[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%124 = vector.extract %115[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%125 = vector.shape_cast %123 : vector<1x1x1x4xf16> to vector<4xf16>
%126 = vector.shape_cast %124 : vector<1x1x4x1xf16> to vector<4xf16>
%127 = amdgpu.mfma %125 * %126 + %122 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%128 = vector.extract %81[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%129 = vector.extract %115[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%130 = vector.shape_cast %128 : vector<1x1x1x4xf16> to vector<4xf16>
%131 = vector.shape_cast %129 : vector<1x1x4x1xf16> to vector<4xf16>
%132 = amdgpu.mfma %130 * %131 + %127 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%133 = vector.extract %81[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%134 = vector.extract %115[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%135 = vector.shape_cast %133 : vector<1x1x1x4xf16> to vector<4xf16>
%136 = vector.shape_cast %134 : vector<1x1x4x1xf16> to vector<4xf16>
%137 = amdgpu.mfma %135 * %136 + %132 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%138 = vector.extract %81[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%139 = vector.extract %115[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%140 = vector.shape_cast %138 : vector<1x1x1x4xf16> to vector<4xf16>
%141 = vector.shape_cast %139 : vector<1x1x4x1xf16> to vector<4xf16>
%142 = amdgpu.mfma %140 * %141 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%143 = vector.extract %81[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%144 = vector.extract %115[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%145 = vector.shape_cast %143 : vector<1x1x1x4xf16> to vector<4xf16>
%146 = vector.shape_cast %144 : vector<1x1x4x1xf16> to vector<4xf16>
%147 = amdgpu.mfma %145 * %146 + %142 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%148 = vector.extract %81[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%149 = vector.extract %115[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%150 = vector.shape_cast %148 : vector<1x1x1x4xf16> to vector<4xf16>
%151 = vector.shape_cast %149 : vector<1x1x4x1xf16> to vector<4xf16>
%152 = amdgpu.mfma %150 * %151 + %147 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%153 = vector.extract %81[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%154 = vector.extract %115[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%155 = vector.shape_cast %153 : vector<1x1x1x4xf16> to vector<4xf16>
%156 = vector.shape_cast %154 : vector<1x1x4x1xf16> to vector<4xf16>
%157 = amdgpu.mfma %155 * %156 + %152 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%158 = vector.shape_cast %157 : vector<4xf32> to vector<1x1x4x1xf32>
%159 = vector.insert %158, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%160 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%161 = vector.extract %115[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%162 = vector.shape_cast %161 : vector<1x1x4x1xf16> to vector<4xf16>
%163 = vector.shape_cast %160 : vector<1x1x4x1xf32> to vector<4xf32>
%164 = amdgpu.mfma %119 * %162 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%165 = vector.extract %115[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%166 = vector.shape_cast %165 : vector<1x1x4x1xf16> to vector<4xf16>
%167 = amdgpu.mfma %125 * %166 + %164 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%168 = vector.extract %115[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%169 = vector.shape_cast %168 : vector<1x1x4x1xf16> to vector<4xf16>
%170 = amdgpu.mfma %130 * %169 + %167 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%171 = vector.extract %115[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%172 = vector.shape_cast %171 : vector<1x1x4x1xf16> to vector<4xf16>
%173 = amdgpu.mfma %135 * %172 + %170 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%174 = vector.extract %115[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%175 = vector.shape_cast %174 : vector<1x1x4x1xf16> to vector<4xf16>
%176 = amdgpu.mfma %140 * %175 + %173 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%177 = vector.extract %115[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%178 = vector.shape_cast %177 : vector<1x1x4x1xf16> to vector<4xf16>
%179 = amdgpu.mfma %145 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%180 = vector.extract %115[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%181 = vector.shape_cast %180 : vector<1x1x4x1xf16> to vector<4xf16>
%182 = amdgpu.mfma %150 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%183 = vector.extract %115[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%184 = vector.shape_cast %183 : vector<1x1x4x1xf16> to vector<4xf16>
%185 = amdgpu.mfma %155 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x4x1xf32>
%187 = vector.insert %186, %159 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%188 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%189 = vector.extract %81[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%190 = vector.shape_cast %189 : vector<1x1x1x4xf16> to vector<4xf16>
%191 = vector.shape_cast %188 : vector<1x1x4x1xf32> to vector<4xf32>
%192 = amdgpu.mfma %190 * %120 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%193 = vector.extract %81[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%194 = vector.shape_cast %193 : vector<1x1x1x4xf16> to vector<4xf16>
%195 = amdgpu.mfma %194 * %126 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%196 = vector.extract %81[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%197 = vector.shape_cast %196 : vector<1x1x1x4xf16> to vector<4xf16>
%198 = amdgpu.mfma %197 * %131 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%199 = vector.extract %81[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%200 = vector.shape_cast %199 : vector<1x1x1x4xf16> to vector<4xf16>
%201 = amdgpu.mfma %200 * %136 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%202 = vector.extract %81[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%203 = vector.shape_cast %202 : vector<1x1x1x4xf16> to vector<4xf16>
%204 = amdgpu.mfma %203 * %141 + %201 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%205 = vector.extract %81[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%206 = vector.shape_cast %205 : vector<1x1x1x4xf16> to vector<4xf16>
%207 = amdgpu.mfma %206 * %146 + %204 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%208 = vector.extract %81[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%209 = vector.shape_cast %208 : vector<1x1x1x4xf16> to vector<4xf16>
%210 = amdgpu.mfma %209 * %151 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%211 = vector.extract %81[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%212 = vector.shape_cast %211 : vector<1x1x1x4xf16> to vector<4xf16>
%213 = amdgpu.mfma %212 * %156 + %210 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%214 = vector.shape_cast %213 : vector<4xf32> to vector<1x1x4x1xf32>
%215 = vector.insert %214, %187 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%216 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%217 = vector.shape_cast %216 : vector<1x1x4x1xf32> to vector<4xf32>
%218 = amdgpu.mfma %190 * %162 + %217 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%219 = amdgpu.mfma %194 * %166 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%220 = amdgpu.mfma %197 * %169 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%221 = amdgpu.mfma %200 * %172 + %220 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%222 = amdgpu.mfma %203 * %175 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%223 = amdgpu.mfma %206 * %178 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%224 = amdgpu.mfma %209 * %181 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%225 = amdgpu.mfma %212 * %184 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x4x1xf32>
%227 = vector.insert %226, %215 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
scf.yield %227 : vector<2x2x1x1x4x1xf32>
}
%4 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
%5 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16)>()[%arg0, %thread_id_x]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16)>()[%arg1, %thread_id_x]
vector.transfer_write %4, %2[%5, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
%7 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
%8 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16 + 16)>()[%arg1, %thread_id_x]
vector.transfer_write %7, %2[%5, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
%9 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
%10 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16 + 16)>()[%arg0, %thread_id_x]
vector.transfer_write %9, %2[%10, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
%11 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %11, %2[%10, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.dealloc %subview_4 : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
memref.dealloc %subview : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
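// In the portion reproduced here the canonicalized IR matches the CSE output line for line; the
// padded workgroup buffers (128x68 and 64x132 backing the 128x64 / 64x128 subviews) are retained,
// consistent with the shared-memory bank-conflict padding requested by the pipeline options.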
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16>
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32>
%cst_2 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%alloc = memref.alloc() : memref<128x68xf16, #gpu.address_space<workgroup>>
%subview = memref.subview %alloc[0, 0] [128, 64] [1, 1] : memref<128x68xf16, #gpu.address_space<workgroup>> to memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<64x132xf16, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc_3[0, 0] [64, 128] [1, 1] : memref<64x132xf16, #gpu.address_space<workgroup>> to memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) {
gpu.barrier
%12 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x]
%13 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x]
%14 = vector.transfer_read %0[%12, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x]
%16 = vector.transfer_read %0[%15, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x]
%18 = vector.transfer_read %0[%17, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x]
%20 = vector.transfer_read %0[%19, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x]
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x]
%23 = vector.transfer_read %1[%21, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x]
%25 = vector.transfer_read %1[%24, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x]
%27 = vector.transfer_read %1[%26, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x]
%29 = vector.transfer_read %1[%28, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x]
%31 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x]
vector.transfer_write %14, %alloc_3[%30, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%32 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x]
vector.transfer_write %16, %alloc_3[%32, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%33 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x]
vector.transfer_write %18, %alloc_3[%33, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x]
vector.transfer_write %20, %alloc_3[%34, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x]
%36 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
vector.transfer_write %23, %alloc[%35, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
%37 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x]
vector.transfer_write %25, %alloc[%37, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x]
vector.transfer_write %27, %alloc[%38, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
%39 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x]
vector.transfer_write %29, %alloc[%39, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
gpu.barrier
%40 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%42 = vector.transfer_read %alloc_3[%40, %41], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%43 = vector.insert_strided_slice %42, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%45 = vector.transfer_read %alloc_3[%40, %44], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%46 = vector.insert_strided_slice %45, %43 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%47 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x]
%48 = vector.transfer_read %alloc_3[%40, %47], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%49 = vector.insert_strided_slice %48, %46 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%50 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x]
%51 = vector.transfer_read %alloc_3[%40, %50], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%52 = vector.insert_strided_slice %51, %49 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%53 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x]
%54 = vector.transfer_read %alloc_3[%40, %53], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%55 = vector.insert_strided_slice %54, %52 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%56 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x]
%57 = vector.transfer_read %alloc_3[%40, %56], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%58 = vector.insert_strided_slice %57, %55 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%59 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x]
%60 = vector.transfer_read %alloc_3[%40, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%61 = vector.insert_strided_slice %60, %58 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%62 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x]
%63 = vector.transfer_read %alloc_3[%40, %62], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%64 = vector.insert_strided_slice %63, %61 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%65 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%66 = vector.transfer_read %alloc_3[%65, %41], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%67 = vector.insert_strided_slice %66, %64 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%68 = vector.transfer_read %alloc_3[%65, %44], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%69 = vector.insert_strided_slice %68, %67 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%70 = vector.transfer_read %alloc_3[%65, %47], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%71 = vector.insert_strided_slice %70, %69 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%72 = vector.transfer_read %alloc_3[%65, %50], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%73 = vector.insert_strided_slice %72, %71 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%74 = vector.transfer_read %alloc_3[%65, %53], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%75 = vector.insert_strided_slice %74, %73 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%76 = vector.transfer_read %alloc_3[%65, %56], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%77 = vector.insert_strided_slice %76, %75 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%78 = vector.transfer_read %alloc_3[%65, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%79 = vector.insert_strided_slice %78, %77 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%80 = vector.transfer_read %alloc_3[%65, %62], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%81 = vector.insert_strided_slice %80, %79 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%82 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%83 = vector.transfer_read %alloc[%41, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%84 = vector.insert_strided_slice %83, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%85 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%86 = vector.transfer_read %alloc[%41, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%87 = vector.insert_strided_slice %86, %84 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%88 = vector.transfer_read %alloc[%44, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%89 = vector.insert_strided_slice %88, %87 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%90 = vector.transfer_read %alloc[%44, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%91 = vector.insert_strided_slice %90, %89 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%92 = vector.transfer_read %alloc[%47, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%93 = vector.insert_strided_slice %92, %91 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%94 = vector.transfer_read %alloc[%47, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%95 = vector.insert_strided_slice %94, %93 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%96 = vector.transfer_read %alloc[%50, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%97 = vector.insert_strided_slice %96, %95 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%98 = vector.transfer_read %alloc[%50, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%99 = vector.insert_strided_slice %98, %97 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%100 = vector.transfer_read %alloc[%53, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%101 = vector.insert_strided_slice %100, %99 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%102 = vector.transfer_read %alloc[%53, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%103 = vector.insert_strided_slice %102, %101 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%104 = vector.transfer_read %alloc[%56, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%105 = vector.insert_strided_slice %104, %103 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%106 = vector.transfer_read %alloc[%56, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%107 = vector.insert_strided_slice %106, %105 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%108 = vector.transfer_read %alloc[%59, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%109 = vector.insert_strided_slice %108, %107 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%110 = vector.transfer_read %alloc[%59, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%111 = vector.insert_strided_slice %110, %109 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%112 = vector.transfer_read %alloc[%62, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%113 = vector.insert_strided_slice %112, %111 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%114 = vector.transfer_read %alloc[%62, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%115 = vector.insert_strided_slice %114, %113 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%116 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%117 = vector.extract %81[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%118 = vector.extract %115[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%119 = vector.shape_cast %117 : vector<1x1x1x4xf16> to vector<4xf16>
%120 = vector.shape_cast %118 : vector<1x1x4x1xf16> to vector<4xf16>
%121 = vector.shape_cast %116 : vector<1x1x4x1xf32> to vector<4xf32>
%122 = amdgpu.mfma %119 * %120 + %121 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
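// Note: this amdgpu.mfma is one 16x16x16 f16 -> f32 matrix multiply-accumulate; the seven that
// follow (through %157) chain their accumulators along k, covering the 8 x 16 = 128 reduction
// columns loaded this iteration for the [0, 0] tile of the accumulator.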
%123 = vector.extract %81[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%124 = vector.extract %115[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%125 = vector.shape_cast %123 : vector<1x1x1x4xf16> to vector<4xf16>
%126 = vector.shape_cast %124 : vector<1x1x4x1xf16> to vector<4xf16>
%127 = amdgpu.mfma %125 * %126 + %122 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%128 = vector.extract %81[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%129 = vector.extract %115[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%130 = vector.shape_cast %128 : vector<1x1x1x4xf16> to vector<4xf16>
%131 = vector.shape_cast %129 : vector<1x1x4x1xf16> to vector<4xf16>
%132 = amdgpu.mfma %130 * %131 + %127 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%133 = vector.extract %81[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%134 = vector.extract %115[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%135 = vector.shape_cast %133 : vector<1x1x1x4xf16> to vector<4xf16>
%136 = vector.shape_cast %134 : vector<1x1x4x1xf16> to vector<4xf16>
%137 = amdgpu.mfma %135 * %136 + %132 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%138 = vector.extract %81[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%139 = vector.extract %115[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%140 = vector.shape_cast %138 : vector<1x1x1x4xf16> to vector<4xf16>
%141 = vector.shape_cast %139 : vector<1x1x4x1xf16> to vector<4xf16>
%142 = amdgpu.mfma %140 * %141 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%143 = vector.extract %81[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%144 = vector.extract %115[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%145 = vector.shape_cast %143 : vector<1x1x1x4xf16> to vector<4xf16>
%146 = vector.shape_cast %144 : vector<1x1x4x1xf16> to vector<4xf16>
%147 = amdgpu.mfma %145 * %146 + %142 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%148 = vector.extract %81[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%149 = vector.extract %115[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%150 = vector.shape_cast %148 : vector<1x1x1x4xf16> to vector<4xf16>
%151 = vector.shape_cast %149 : vector<1x1x4x1xf16> to vector<4xf16>
%152 = amdgpu.mfma %150 * %151 + %147 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%153 = vector.extract %81[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%154 = vector.extract %115[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%155 = vector.shape_cast %153 : vector<1x1x1x4xf16> to vector<4xf16>
%156 = vector.shape_cast %154 : vector<1x1x4x1xf16> to vector<4xf16>
%157 = amdgpu.mfma %155 * %156 + %152 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%158 = vector.shape_cast %157 : vector<4xf32> to vector<1x1x4x1xf32>
%159 = vector.insert %158, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%160 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%161 = vector.extract %115[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%162 = vector.shape_cast %161 : vector<1x1x4x1xf16> to vector<4xf16>
%163 = vector.shape_cast %160 : vector<1x1x4x1xf32> to vector<4xf32>
%164 = amdgpu.mfma %119 * %162 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%165 = vector.extract %115[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%166 = vector.shape_cast %165 : vector<1x1x4x1xf16> to vector<4xf16>
%167 = amdgpu.mfma %125 * %166 + %164 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%168 = vector.extract %115[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%169 = vector.shape_cast %168 : vector<1x1x4x1xf16> to vector<4xf16>
%170 = amdgpu.mfma %130 * %169 + %167 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%171 = vector.extract %115[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%172 = vector.shape_cast %171 : vector<1x1x4x1xf16> to vector<4xf16>
%173 = amdgpu.mfma %135 * %172 + %170 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%174 = vector.extract %115[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%175 = vector.shape_cast %174 : vector<1x1x4x1xf16> to vector<4xf16>
%176 = amdgpu.mfma %140 * %175 + %173 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%177 = vector.extract %115[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%178 = vector.shape_cast %177 : vector<1x1x4x1xf16> to vector<4xf16>
%179 = amdgpu.mfma %145 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%180 = vector.extract %115[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%181 = vector.shape_cast %180 : vector<1x1x4x1xf16> to vector<4xf16>
%182 = amdgpu.mfma %150 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%183 = vector.extract %115[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%184 = vector.shape_cast %183 : vector<1x1x4x1xf16> to vector<4xf16>
%185 = amdgpu.mfma %155 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x4x1xf32>
%187 = vector.insert %186, %159 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%188 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%189 = vector.extract %81[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%190 = vector.shape_cast %189 : vector<1x1x1x4xf16> to vector<4xf16>
%191 = vector.shape_cast %188 : vector<1x1x4x1xf32> to vector<4xf32>
%192 = amdgpu.mfma %190 * %120 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%193 = vector.extract %81[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%194 = vector.shape_cast %193 : vector<1x1x1x4xf16> to vector<4xf16>
%195 = amdgpu.mfma %194 * %126 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%196 = vector.extract %81[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%197 = vector.shape_cast %196 : vector<1x1x1x4xf16> to vector<4xf16>
%198 = amdgpu.mfma %197 * %131 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%199 = vector.extract %81[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%200 = vector.shape_cast %199 : vector<1x1x1x4xf16> to vector<4xf16>
%201 = amdgpu.mfma %200 * %136 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%202 = vector.extract %81[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%203 = vector.shape_cast %202 : vector<1x1x1x4xf16> to vector<4xf16>
%204 = amdgpu.mfma %203 * %141 + %201 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%205 = vector.extract %81[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%206 = vector.shape_cast %205 : vector<1x1x1x4xf16> to vector<4xf16>
%207 = amdgpu.mfma %206 * %146 + %204 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%208 = vector.extract %81[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%209 = vector.shape_cast %208 : vector<1x1x1x4xf16> to vector<4xf16>
%210 = amdgpu.mfma %209 * %151 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%211 = vector.extract %81[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%212 = vector.shape_cast %211 : vector<1x1x1x4xf16> to vector<4xf16>
%213 = amdgpu.mfma %212 * %156 + %210 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%214 = vector.shape_cast %213 : vector<4xf32> to vector<1x1x4x1xf32>
%215 = vector.insert %214, %187 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%216 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%217 = vector.shape_cast %216 : vector<1x1x4x1xf32> to vector<4xf32>
%218 = amdgpu.mfma %190 * %162 + %217 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%219 = amdgpu.mfma %194 * %166 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%220 = amdgpu.mfma %197 * %169 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%221 = amdgpu.mfma %200 * %172 + %220 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%222 = amdgpu.mfma %203 * %175 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%223 = amdgpu.mfma %206 * %178 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%224 = amdgpu.mfma %209 * %181 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%225 = amdgpu.mfma %212 * %184 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x4x1xf32>
%227 = vector.insert %226, %215 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
scf.yield %227 : vector<2x2x1x1x4x1xf32>
}
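// Epilogue: the four 4x1 accumulator fragments held in %3 are extracted and written back to the
// 256x256 f32 output buffer at row/column offsets derived from the workgroup offsets (%arg0, %arg1)
// and the thread id.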
%4 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
%5 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16)>()[%arg0, %thread_id_x]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16)>()[%arg1, %thread_id_x]
vector.transfer_write %4, %2[%5, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
%7 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
%8 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16 + 16)>()[%arg1, %thread_id_x]
vector.transfer_write %7, %2[%5, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
%9 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
%10 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16 + 16)>()[%arg0, %thread_id_x]
vector.transfer_write %9, %2[%10, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
%11 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %11, %2[%10, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.dealloc %subview_4 : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
memref.dealloc %subview : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After CSE (cse) //----- //
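// Note: in the region shown here the loop body after CSE matches the previous dump; the shared
// index computations (e.g. %41, %44, %47) were already reused, so CSE appears to have little left
// to eliminate at this point.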
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16>
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32>
%cst_2 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%alloc = memref.alloc() : memref<128x68xf16, #gpu.address_space<workgroup>>
%subview = memref.subview %alloc[0, 0] [128, 64] [1, 1] : memref<128x68xf16, #gpu.address_space<workgroup>> to memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<64x132xf16, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc_3[0, 0] [64, 128] [1, 1] : memref<64x132xf16, #gpu.address_space<workgroup>> to memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
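// Note: the workgroup tiles are over-allocated (64x132 instead of 64x128, and 128x68 instead of
// 128x64); the four extra f16 columns are presumably the shared-memory padding added to reduce
// bank conflicts (no_reduce_shared_memory_bank_conflicts = false in the pipeline options).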
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) {
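// Note: the k-loop runs two iterations of 128 over the 256-wide reduction dimension, carrying a
// vector<2x2x1x1x4x1xf32> accumulator: a 2x2 grid of 16x16 MFMA tiles with 4x1 f32 per lane, which
// appears to give each of the four 64-lane subgroups a 32x32 sub-tile of the 64x64 workgroup tile.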
gpu.barrier
%12 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x]
%13 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x]
%14 = vector.transfer_read %0[%12, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x]
%16 = vector.transfer_read %0[%15, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x]
%18 = vector.transfer_read %0[%17, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x]
%20 = vector.transfer_read %0[%19, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x]
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x]
%23 = vector.transfer_read %1[%21, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x]
%25 = vector.transfer_read %1[%24, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x]
%27 = vector.transfer_read %1[%26, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x]
%29 = vector.transfer_read %1[%28, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x]
%31 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x]
vector.transfer_write %14, %alloc_3[%30, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%32 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x]
vector.transfer_write %16, %alloc_3[%32, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%33 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x]
vector.transfer_write %18, %alloc_3[%33, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x]
vector.transfer_write %20, %alloc_3[%34, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x]
%36 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
vector.transfer_write %23, %alloc[%35, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
%37 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x]
vector.transfer_write %25, %alloc[%37, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x]
vector.transfer_write %27, %alloc[%38, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
%39 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x]
vector.transfer_write %29, %alloc[%39, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
gpu.barrier
%40 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%42 = vector.transfer_read %alloc_3[%40, %41], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%43 = vector.insert_strided_slice %42, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%45 = vector.transfer_read %alloc_3[%40, %44], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%46 = vector.insert_strided_slice %45, %43 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%47 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x]
%48 = vector.transfer_read %alloc_3[%40, %47], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%49 = vector.insert_strided_slice %48, %46 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%50 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x]
%51 = vector.transfer_read %alloc_3[%40, %50], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%52 = vector.insert_strided_slice %51, %49 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%53 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x]
%54 = vector.transfer_read %alloc_3[%40, %53], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%55 = vector.insert_strided_slice %54, %52 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%56 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x]
%57 = vector.transfer_read %alloc_3[%40, %56], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%58 = vector.insert_strided_slice %57, %55 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%59 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x]
%60 = vector.transfer_read %alloc_3[%40, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%61 = vector.insert_strided_slice %60, %58 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%62 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x]
%63 = vector.transfer_read %alloc_3[%40, %62], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%64 = vector.insert_strided_slice %63, %61 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%65 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%66 = vector.transfer_read %alloc_3[%65, %41], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%67 = vector.insert_strided_slice %66, %64 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%68 = vector.transfer_read %alloc_3[%65, %44], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%69 = vector.insert_strided_slice %68, %67 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%70 = vector.transfer_read %alloc_3[%65, %47], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%71 = vector.insert_strided_slice %70, %69 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%72 = vector.transfer_read %alloc_3[%65, %50], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%73 = vector.insert_strided_slice %72, %71 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%74 = vector.transfer_read %alloc_3[%65, %53], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%75 = vector.insert_strided_slice %74, %73 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%76 = vector.transfer_read %alloc_3[%65, %56], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%77 = vector.insert_strided_slice %76, %75 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%78 = vector.transfer_read %alloc_3[%65, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%79 = vector.insert_strided_slice %78, %77 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%80 = vector.transfer_read %alloc_3[%65, %62], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%81 = vector.insert_strided_slice %80, %79 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%82 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%83 = vector.transfer_read %alloc[%41, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%84 = vector.insert_strided_slice %83, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%85 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%86 = vector.transfer_read %alloc[%41, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%87 = vector.insert_strided_slice %86, %84 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%88 = vector.transfer_read %alloc[%44, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%89 = vector.insert_strided_slice %88, %87 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%90 = vector.transfer_read %alloc[%44, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%91 = vector.insert_strided_slice %90, %89 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%92 = vector.transfer_read %alloc[%47, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%93 = vector.insert_strided_slice %92, %91 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%94 = vector.transfer_read %alloc[%47, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%95 = vector.insert_strided_slice %94, %93 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%96 = vector.transfer_read %alloc[%50, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%97 = vector.insert_strided_slice %96, %95 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%98 = vector.transfer_read %alloc[%50, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%99 = vector.insert_strided_slice %98, %97 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%100 = vector.transfer_read %alloc[%53, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%101 = vector.insert_strided_slice %100, %99 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%102 = vector.transfer_read %alloc[%53, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%103 = vector.insert_strided_slice %102, %101 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%104 = vector.transfer_read %alloc[%56, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%105 = vector.insert_strided_slice %104, %103 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%106 = vector.transfer_read %alloc[%56, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%107 = vector.insert_strided_slice %106, %105 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%108 = vector.transfer_read %alloc[%59, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%109 = vector.insert_strided_slice %108, %107 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%110 = vector.transfer_read %alloc[%59, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%111 = vector.insert_strided_slice %110, %109 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%112 = vector.transfer_read %alloc[%62, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%113 = vector.insert_strided_slice %112, %111 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%114 = vector.transfer_read %alloc[%62, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%115 = vector.insert_strided_slice %114, %113 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
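// Note: at this point %81 holds the per-lane LHS fragments (2 m-tiles x 8 k-tiles of 1x4 f16) read
// from %alloc_3, and %115 holds the RHS fragments (8 k-tiles x 2 n-tiles of 4x1 f16) read from
// %alloc; the 32 mfma ops below consume them to update the 2x2 accumulator grid.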
%116 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%117 = vector.extract %81[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%118 = vector.extract %115[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%119 = vector.shape_cast %117 : vector<1x1x1x4xf16> to vector<4xf16>
%120 = vector.shape_cast %118 : vector<1x1x4x1xf16> to vector<4xf16>
%121 = vector.shape_cast %116 : vector<1x1x4x1xf32> to vector<4xf32>
%122 = amdgpu.mfma %119 * %120 + %121 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%123 = vector.extract %81[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%124 = vector.extract %115[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%125 = vector.shape_cast %123 : vector<1x1x1x4xf16> to vector<4xf16>
%126 = vector.shape_cast %124 : vector<1x1x4x1xf16> to vector<4xf16>
%127 = amdgpu.mfma %125 * %126 + %122 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%128 = vector.extract %81[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%129 = vector.extract %115[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%130 = vector.shape_cast %128 : vector<1x1x1x4xf16> to vector<4xf16>
%131 = vector.shape_cast %129 : vector<1x1x4x1xf16> to vector<4xf16>
%132 = amdgpu.mfma %130 * %131 + %127 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%133 = vector.extract %81[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%134 = vector.extract %115[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%135 = vector.shape_cast %133 : vector<1x1x1x4xf16> to vector<4xf16>
%136 = vector.shape_cast %134 : vector<1x1x4x1xf16> to vector<4xf16>
%137 = amdgpu.mfma %135 * %136 + %132 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%138 = vector.extract %81[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%139 = vector.extract %115[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%140 = vector.shape_cast %138 : vector<1x1x1x4xf16> to vector<4xf16>
%141 = vector.shape_cast %139 : vector<1x1x4x1xf16> to vector<4xf16>
%142 = amdgpu.mfma %140 * %141 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%143 = vector.extract %81[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%144 = vector.extract %115[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%145 = vector.shape_cast %143 : vector<1x1x1x4xf16> to vector<4xf16>
%146 = vector.shape_cast %144 : vector<1x1x4x1xf16> to vector<4xf16>
%147 = amdgpu.mfma %145 * %146 + %142 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%148 = vector.extract %81[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%149 = vector.extract %115[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%150 = vector.shape_cast %148 : vector<1x1x1x4xf16> to vector<4xf16>
%151 = vector.shape_cast %149 : vector<1x1x4x1xf16> to vector<4xf16>
%152 = amdgpu.mfma %150 * %151 + %147 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%153 = vector.extract %81[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%154 = vector.extract %115[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%155 = vector.shape_cast %153 : vector<1x1x1x4xf16> to vector<4xf16>
%156 = vector.shape_cast %154 : vector<1x1x4x1xf16> to vector<4xf16>
%157 = amdgpu.mfma %155 * %156 + %152 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%158 = vector.shape_cast %157 : vector<4xf32> to vector<1x1x4x1xf32>
%159 = vector.insert %158, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%160 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%161 = vector.extract %115[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%162 = vector.shape_cast %161 : vector<1x1x4x1xf16> to vector<4xf16>
%163 = vector.shape_cast %160 : vector<1x1x4x1xf32> to vector<4xf32>
%164 = amdgpu.mfma %119 * %162 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%165 = vector.extract %115[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%166 = vector.shape_cast %165 : vector<1x1x4x1xf16> to vector<4xf16>
%167 = amdgpu.mfma %125 * %166 + %164 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%168 = vector.extract %115[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%169 = vector.shape_cast %168 : vector<1x1x4x1xf16> to vector<4xf16>
%170 = amdgpu.mfma %130 * %169 + %167 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%171 = vector.extract %115[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%172 = vector.shape_cast %171 : vector<1x1x4x1xf16> to vector<4xf16>
%173 = amdgpu.mfma %135 * %172 + %170 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%174 = vector.extract %115[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%175 = vector.shape_cast %174 : vector<1x1x4x1xf16> to vector<4xf16>
%176 = amdgpu.mfma %140 * %175 + %173 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%177 = vector.extract %115[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%178 = vector.shape_cast %177 : vector<1x1x4x1xf16> to vector<4xf16>
%179 = amdgpu.mfma %145 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%180 = vector.extract %115[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%181 = vector.shape_cast %180 : vector<1x1x4x1xf16> to vector<4xf16>
%182 = amdgpu.mfma %150 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%183 = vector.extract %115[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%184 = vector.shape_cast %183 : vector<1x1x4x1xf16> to vector<4xf16>
%185 = amdgpu.mfma %155 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x4x1xf32>
%187 = vector.insert %186, %159 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%188 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%189 = vector.extract %81[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%190 = vector.shape_cast %189 : vector<1x1x1x4xf16> to vector<4xf16>
%191 = vector.shape_cast %188 : vector<1x1x4x1xf32> to vector<4xf32>
%192 = amdgpu.mfma %190 * %120 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%193 = vector.extract %81[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%194 = vector.shape_cast %193 : vector<1x1x1x4xf16> to vector<4xf16>
%195 = amdgpu.mfma %194 * %126 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%196 = vector.extract %81[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%197 = vector.shape_cast %196 : vector<1x1x1x4xf16> to vector<4xf16>
%198 = amdgpu.mfma %197 * %131 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%199 = vector.extract %81[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%200 = vector.shape_cast %199 : vector<1x1x1x4xf16> to vector<4xf16>
%201 = amdgpu.mfma %200 * %136 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%202 = vector.extract %81[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%203 = vector.shape_cast %202 : vector<1x1x1x4xf16> to vector<4xf16>
%204 = amdgpu.mfma %203 * %141 + %201 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%205 = vector.extract %81[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%206 = vector.shape_cast %205 : vector<1x1x1x4xf16> to vector<4xf16>
%207 = amdgpu.mfma %206 * %146 + %204 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%208 = vector.extract %81[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%209 = vector.shape_cast %208 : vector<1x1x1x4xf16> to vector<4xf16>
%210 = amdgpu.mfma %209 * %151 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%211 = vector.extract %81[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%212 = vector.shape_cast %211 : vector<1x1x1x4xf16> to vector<4xf16>
%213 = amdgpu.mfma %212 * %156 + %210 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%214 = vector.shape_cast %213 : vector<4xf32> to vector<1x1x4x1xf32>
%215 = vector.insert %214, %187 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%216 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%217 = vector.shape_cast %216 : vector<1x1x4x1xf32> to vector<4xf32>
%218 = amdgpu.mfma %190 * %162 + %217 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%219 = amdgpu.mfma %194 * %166 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%220 = amdgpu.mfma %197 * %169 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%221 = amdgpu.mfma %200 * %172 + %220 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%222 = amdgpu.mfma %203 * %175 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%223 = amdgpu.mfma %206 * %178 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%224 = amdgpu.mfma %209 * %181 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%225 = amdgpu.mfma %212 * %184 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x4x1xf32>
%227 = vector.insert %226, %215 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
scf.yield %227 : vector<2x2x1x1x4x1xf32>
}
%4 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
%5 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16)>()[%arg0, %thread_id_x]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16)>()[%arg1, %thread_id_x]
vector.transfer_write %4, %2[%5, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
%7 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
%8 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16 + 16)>()[%arg1, %thread_id_x]
vector.transfer_write %7, %2[%5, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
%9 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
%10 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16 + 16)>()[%arg0, %thread_id_x]
vector.transfer_write %9, %2[%10, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
%11 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %11, %2[%10, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.dealloc %subview_4 : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
memref.dealloc %subview : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
return
}
// -----// IR Dump After LLVMGPULowerExecutableTargetPass (iree-llvmgpu-lower-executable-target) //----- //
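// Note: this dump shows the IR after the LLVMGPUVectorDistribute lowering pipeline has run to
// completion; the portion reproduced below starts out identical to the post-CSE dump above.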
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16>
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16>
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32>
%cst_2 = arith.constant 0.000000e+00 : f16
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%thread_id_x = gpu.thread_id x
%alloc = memref.alloc() : memref<128x68xf16, #gpu.address_space<workgroup>>
%subview = memref.subview %alloc[0, 0] [128, 64] [1, 1] : memref<128x68xf16, #gpu.address_space<workgroup>> to memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<64x132xf16, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc_3[0, 0] [64, 128] [1, 1] : memref<64x132xf16, #gpu.address_space<workgroup>> to memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) {
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) {
gpu.barrier
%12 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x]
%13 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x]
%14 = vector.transfer_read %0[%12, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x]
%16 = vector.transfer_read %0[%15, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x]
%18 = vector.transfer_read %0[%17, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x]
%20 = vector.transfer_read %0[%19, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x]
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x]
%23 = vector.transfer_read %1[%21, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x]
%25 = vector.transfer_read %1[%24, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x]
%27 = vector.transfer_read %1[%26, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x]
%29 = vector.transfer_read %1[%28, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16>
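// Stage the loaded fragments into shared memory: A rows go to %alloc_3,
// B rows go to %alloc.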
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x]
%31 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x]
vector.transfer_write %14, %alloc_3[%30, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%32 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x]
vector.transfer_write %16, %alloc_3[%32, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%33 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x]
vector.transfer_write %18, %alloc_3[%33, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x]
vector.transfer_write %20, %alloc_3[%34, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>>
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x]
%36 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x]
vector.transfer_write %23, %alloc[%35, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
%37 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x]
vector.transfer_write %25, %alloc[%37, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x]
vector.transfer_write %27, %alloc[%38, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
%39 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x]
vector.transfer_write %29, %alloc[%39, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>>
gpu.barrier
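// Read MFMA operands back from shared memory: vector<1x4xf16> A fragments are
// assembled into %81 (2 M-subtiles x 8 K-fragments) and vector<4x1xf16> B
// fragments into %115 (8 K-fragments x 2 N-subtiles).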
%40 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x]
%42 = vector.transfer_read %alloc_3[%40, %41], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%43 = vector.insert_strided_slice %42, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x]
%45 = vector.transfer_read %alloc_3[%40, %44], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%46 = vector.insert_strided_slice %45, %43 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%47 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x]
%48 = vector.transfer_read %alloc_3[%40, %47], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%49 = vector.insert_strided_slice %48, %46 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%50 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x]
%51 = vector.transfer_read %alloc_3[%40, %50], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%52 = vector.insert_strided_slice %51, %49 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%53 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x]
%54 = vector.transfer_read %alloc_3[%40, %53], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%55 = vector.insert_strided_slice %54, %52 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%56 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x]
%57 = vector.transfer_read %alloc_3[%40, %56], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%58 = vector.insert_strided_slice %57, %55 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%59 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x]
%60 = vector.transfer_read %alloc_3[%40, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%61 = vector.insert_strided_slice %60, %58 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%62 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x]
%63 = vector.transfer_read %alloc_3[%40, %62], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%64 = vector.insert_strided_slice %63, %61 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%65 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%66 = vector.transfer_read %alloc_3[%65, %41], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%67 = vector.insert_strided_slice %66, %64 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%68 = vector.transfer_read %alloc_3[%65, %44], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%69 = vector.insert_strided_slice %68, %67 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%70 = vector.transfer_read %alloc_3[%65, %47], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%71 = vector.insert_strided_slice %70, %69 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%72 = vector.transfer_read %alloc_3[%65, %50], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%73 = vector.insert_strided_slice %72, %71 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%74 = vector.transfer_read %alloc_3[%65, %53], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%75 = vector.insert_strided_slice %74, %73 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%76 = vector.transfer_read %alloc_3[%65, %56], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%77 = vector.insert_strided_slice %76, %75 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%78 = vector.transfer_read %alloc_3[%65, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%79 = vector.insert_strided_slice %78, %77 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%80 = vector.transfer_read %alloc_3[%65, %62], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16>
%81 = vector.insert_strided_slice %80, %79 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16>
%82 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x]
%83 = vector.transfer_read %alloc[%41, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%84 = vector.insert_strided_slice %83, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%85 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x]
%86 = vector.transfer_read %alloc[%41, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%87 = vector.insert_strided_slice %86, %84 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%88 = vector.transfer_read %alloc[%44, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%89 = vector.insert_strided_slice %88, %87 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%90 = vector.transfer_read %alloc[%44, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%91 = vector.insert_strided_slice %90, %89 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%92 = vector.transfer_read %alloc[%47, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%93 = vector.insert_strided_slice %92, %91 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%94 = vector.transfer_read %alloc[%47, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%95 = vector.insert_strided_slice %94, %93 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%96 = vector.transfer_read %alloc[%50, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%97 = vector.insert_strided_slice %96, %95 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%98 = vector.transfer_read %alloc[%50, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%99 = vector.insert_strided_slice %98, %97 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%100 = vector.transfer_read %alloc[%53, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%101 = vector.insert_strided_slice %100, %99 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%102 = vector.transfer_read %alloc[%53, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%103 = vector.insert_strided_slice %102, %101 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%104 = vector.transfer_read %alloc[%56, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%105 = vector.insert_strided_slice %104, %103 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%106 = vector.transfer_read %alloc[%56, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%107 = vector.insert_strided_slice %106, %105 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%108 = vector.transfer_read %alloc[%59, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%109 = vector.insert_strided_slice %108, %107 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%110 = vector.transfer_read %alloc[%59, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%111 = vector.insert_strided_slice %110, %109 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%112 = vector.transfer_read %alloc[%62, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%113 = vector.insert_strided_slice %112, %111 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
%114 = vector.transfer_read %alloc[%62, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16>
%115 = vector.insert_strided_slice %114, %113 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16>
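// Compute: each of the four per-thread accumulators (%arg3[i, j]) is updated
// by a chain of eight 16x16x16 f16 amdgpu.mfma ops, one per K fragment.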
%116 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%117 = vector.extract %81[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%118 = vector.extract %115[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%119 = vector.shape_cast %117 : vector<1x1x1x4xf16> to vector<4xf16>
%120 = vector.shape_cast %118 : vector<1x1x4x1xf16> to vector<4xf16>
%121 = vector.shape_cast %116 : vector<1x1x4x1xf32> to vector<4xf32>
%122 = amdgpu.mfma %119 * %120 + %121 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%123 = vector.extract %81[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%124 = vector.extract %115[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%125 = vector.shape_cast %123 : vector<1x1x1x4xf16> to vector<4xf16>
%126 = vector.shape_cast %124 : vector<1x1x4x1xf16> to vector<4xf16>
%127 = amdgpu.mfma %125 * %126 + %122 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%128 = vector.extract %81[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%129 = vector.extract %115[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%130 = vector.shape_cast %128 : vector<1x1x1x4xf16> to vector<4xf16>
%131 = vector.shape_cast %129 : vector<1x1x4x1xf16> to vector<4xf16>
%132 = amdgpu.mfma %130 * %131 + %127 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%133 = vector.extract %81[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%134 = vector.extract %115[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%135 = vector.shape_cast %133 : vector<1x1x1x4xf16> to vector<4xf16>
%136 = vector.shape_cast %134 : vector<1x1x4x1xf16> to vector<4xf16>
%137 = amdgpu.mfma %135 * %136 + %132 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%138 = vector.extract %81[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%139 = vector.extract %115[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%140 = vector.shape_cast %138 : vector<1x1x1x4xf16> to vector<4xf16>
%141 = vector.shape_cast %139 : vector<1x1x4x1xf16> to vector<4xf16>
%142 = amdgpu.mfma %140 * %141 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%143 = vector.extract %81[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%144 = vector.extract %115[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%145 = vector.shape_cast %143 : vector<1x1x1x4xf16> to vector<4xf16>
%146 = vector.shape_cast %144 : vector<1x1x4x1xf16> to vector<4xf16>
%147 = amdgpu.mfma %145 * %146 + %142 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%148 = vector.extract %81[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%149 = vector.extract %115[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%150 = vector.shape_cast %148 : vector<1x1x1x4xf16> to vector<4xf16>
%151 = vector.shape_cast %149 : vector<1x1x4x1xf16> to vector<4xf16>
%152 = amdgpu.mfma %150 * %151 + %147 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%153 = vector.extract %81[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%154 = vector.extract %115[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%155 = vector.shape_cast %153 : vector<1x1x1x4xf16> to vector<4xf16>
%156 = vector.shape_cast %154 : vector<1x1x4x1xf16> to vector<4xf16>
%157 = amdgpu.mfma %155 * %156 + %152 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%158 = vector.shape_cast %157 : vector<4xf32> to vector<1x1x4x1xf32>
%159 = vector.insert %158, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%160 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%161 = vector.extract %115[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%162 = vector.shape_cast %161 : vector<1x1x4x1xf16> to vector<4xf16>
%163 = vector.shape_cast %160 : vector<1x1x4x1xf32> to vector<4xf32>
%164 = amdgpu.mfma %119 * %162 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%165 = vector.extract %115[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%166 = vector.shape_cast %165 : vector<1x1x4x1xf16> to vector<4xf16>
%167 = amdgpu.mfma %125 * %166 + %164 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%168 = vector.extract %115[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%169 = vector.shape_cast %168 : vector<1x1x4x1xf16> to vector<4xf16>
%170 = amdgpu.mfma %130 * %169 + %167 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%171 = vector.extract %115[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%172 = vector.shape_cast %171 : vector<1x1x4x1xf16> to vector<4xf16>
%173 = amdgpu.mfma %135 * %172 + %170 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%174 = vector.extract %115[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%175 = vector.shape_cast %174 : vector<1x1x4x1xf16> to vector<4xf16>
%176 = amdgpu.mfma %140 * %175 + %173 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%177 = vector.extract %115[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%178 = vector.shape_cast %177 : vector<1x1x4x1xf16> to vector<4xf16>
%179 = amdgpu.mfma %145 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%180 = vector.extract %115[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%181 = vector.shape_cast %180 : vector<1x1x4x1xf16> to vector<4xf16>
%182 = amdgpu.mfma %150 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%183 = vector.extract %115[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16>
%184 = vector.shape_cast %183 : vector<1x1x4x1xf16> to vector<4xf16>
%185 = amdgpu.mfma %155 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x4x1xf32>
%187 = vector.insert %186, %159 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%188 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%189 = vector.extract %81[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%190 = vector.shape_cast %189 : vector<1x1x1x4xf16> to vector<4xf16>
%191 = vector.shape_cast %188 : vector<1x1x4x1xf32> to vector<4xf32>
%192 = amdgpu.mfma %190 * %120 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%193 = vector.extract %81[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%194 = vector.shape_cast %193 : vector<1x1x1x4xf16> to vector<4xf16>
%195 = amdgpu.mfma %194 * %126 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%196 = vector.extract %81[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%197 = vector.shape_cast %196 : vector<1x1x1x4xf16> to vector<4xf16>
%198 = amdgpu.mfma %197 * %131 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%199 = vector.extract %81[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%200 = vector.shape_cast %199 : vector<1x1x1x4xf16> to vector<4xf16>
%201 = amdgpu.mfma %200 * %136 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%202 = vector.extract %81[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%203 = vector.shape_cast %202 : vector<1x1x1x4xf16> to vector<4xf16>
%204 = amdgpu.mfma %203 * %141 + %201 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%205 = vector.extract %81[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%206 = vector.shape_cast %205 : vector<1x1x1x4xf16> to vector<4xf16>
%207 = amdgpu.mfma %206 * %146 + %204 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%208 = vector.extract %81[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%209 = vector.shape_cast %208 : vector<1x1x1x4xf16> to vector<4xf16>
%210 = amdgpu.mfma %209 * %151 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%211 = vector.extract %81[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16>
%212 = vector.shape_cast %211 : vector<1x1x1x4xf16> to vector<4xf16>
%213 = amdgpu.mfma %212 * %156 + %210 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%214 = vector.shape_cast %213 : vector<4xf32> to vector<1x1x4x1xf32>
%215 = vector.insert %214, %187 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
%216 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32>
%217 = vector.shape_cast %216 : vector<1x1x4x1xf32> to vector<4xf32>
%218 = amdgpu.mfma %190 * %162 + %217 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%219 = amdgpu.mfma %194 * %166 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%220 = amdgpu.mfma %197 * %169 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%221 = amdgpu.mfma %200 * %172 + %220 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%222 = amdgpu.mfma %203 * %175 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%223 = amdgpu.mfma %206 * %178 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%224 = amdgpu.mfma %209 * %181 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%225 = amdgpu.mfma %212 * %184 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x4x1xf32>
%227 = vector.insert %226, %215 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32>
scf.yield %227 : vector<2x2x1x1x4x1xf32>
}
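// Epilogue: extract the four vector<4x1xf32> accumulator fragments from the
// loop result %3 and store them to the output buffer %2 at thread-mapped offsets.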
%4 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
%5 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16)>()[%arg0, %thread_id_x]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16)>()[%arg1, %thread_id_x]
vector.transfer_write %4, %2[%5, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
%7 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
%8 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16 + 16)>()[%arg1, %thread_id_x]
vector.transfer_write %7, %2[%5, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
%9 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
%10 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16 + 16)>()[%arg0, %thread_id_x]
vector.transfer_write %9, %2[%10, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
%11 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32>
vector.transfer_write %11, %2[%10, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.dealloc %subview_4 : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>
memref.dealloc %subview : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>
return
}