Created January 22, 2025 16:16
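What follows is a per-pass IR dump from IREE's LLVMGPU codegen pipeline for a 256x256x256 matmul with f16 inputs and an f32 accumulator, lowered through the LLVMGPUVectorDistribute pipeline. Dumps of this form are typically produced by adding --mlir-print-ir-after-all to an iree-compile or iree-opt invocation; the exact command line is not recorded in this gist, so treat that as an assumption about provenance rather than the command actually used.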
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
  %5 = tensor.empty() : tensor<256x256xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<256x256xf16> to tensor<64x256xf16>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x256xf16> to tensor<256x64xf16>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
    %7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32>
    %8 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x256xf16>, tensor<256x64xf16>) outs(%7 : tensor<64x64xf32>) -> tensor<64x64xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
  return
}
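// Editor's note: at this point the 256x256x256 matmul has been tiled into 64x64 output tiles, each assigned to one
// workgroup via scf.forall with #iree_codegen.workgroup_mapping (y, x). The lowering_config already records the chosen
// MFMA_F32_16x16x16_F16 intrinsic, the 2x2 subgroup arrangement (subgroup_m_count/subgroup_n_count = 2), operand
// promotion for inputs 0 and 1, and a K reduction tile of 128 that later passes materialize.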
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
// (IR unchanged from the previous dump.)
// -----// IR Dump After CSE (cse) //----- //
// (IR unchanged from the previous dump.)
// -----// IR Dump After ConvertAttentionToOnlineAttentionPass (iree-linalg-ext-convert-attention-to-online-attention) //----- //
// (IR unchanged from the previous dump.)
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
// (IR unchanged from the previous dump.)
// -----// IR Dump After CSE (cse) //----- //
// (IR unchanged from the previous dump.)
// -----// IR Dump After GPUPromoteMatmulOperandsPass (iree-codegen-gpu-promote-matmul-operands) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
  %5 = tensor.empty() : tensor<256x256xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<256x256xf16> to tensor<64x256xf16>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x256xf16> to tensor<256x64xf16>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
    %7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32>
    %8 = tensor.empty() : tensor<64x256xf16>
    %9 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<64x256xf16>) outs(%8 : tensor<64x256xf16>) -> tensor<64x256xf16>
    %10 = tensor.empty() : tensor<256x64xf16>
    %11 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<256x64xf16>) outs(%10 : tensor<256x64xf16>) -> tensor<256x64xf16>
    %12 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%9, %11 : tensor<64x256xf16>, tensor<256x64xf16>) outs(%7 : tensor<64x64xf32>) -> tensor<64x64xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %12 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
  return
}
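// Editor's note: GPUPromoteMatmulOperandsPass acted on promote_operands = [0, 1] by inserting linalg.copy ops tagged
// with #iree_gpu.derived_thread_config on the 64x256 and 256x64 operand tiles, so the matmul now reads the copies
// rather than the global-memory slices; these staging copies are what later become workgroup shared-memory buffers.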
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
  %c128 = arith.constant 128 : index
  %c256 = arith.constant 256 : index
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
  %5 = tensor.empty() : tensor<256x256xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
    %7 = linalg.fill ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) -> tensor<64x64xf32>
    %8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
      %extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
      %9 = tensor.empty() : tensor<64x128xf16>
      %10 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%9 : tensor<64x128xf16>) -> tensor<64x128xf16>
      %extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
      %11 = tensor.empty() : tensor<128x64xf16>
      %12 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%11 : tensor<128x64xf16>) -> tensor<128x64xf16>
      %13 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%10, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%arg4 : tensor<64x64xf32>) -> tensor<64x64xf32>
      scf.yield %13 : tensor<64x64xf32>
    }
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
  return
}
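// Editor's note: GPUApplyTilingLevelPass materialized the reduction level of the lowering_config
// (reduction = [0, 0, 128]) as an scf.for over K from 0 to 256 with step 128 that carries the 64x64 accumulator as an
// iter_arg; the promoted copies now stage 64x128 and 128x64 slices per iteration instead of the full-K tiles.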
// -----// IR Dump After LoopCoalescing (affine-loop-coalescing) //----- //
// (IR unchanged from the previous dump.)
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
// (IR unchanged from the previous dump.)
// -----// IR Dump After CSE (cse) //----- //
// (IR unchanged from the previous dump.)
// -----// IR Dump After DecomposeAttentionPass (iree-linalg-ext-decompose-attention) //----- //
// (IR unchanged from the previous dump.)
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
// (IR unchanged from the previous dump.)
// -----// IR Dump After CSE (cse) //----- //
// (IR unchanged from the previous dump.)
// -----// IR Dump After LLVMGPUConfigureTensorLayoutsPass (iree-llvmgpu-configure-tensor-layouts) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
  %c128 = arith.constant 128 : index
  %c256 = arith.constant 256 : index
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
  %5 = tensor.empty() : tensor<256x256xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
    %7 = linalg.fill ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) -> tensor<64x64xf32>
    %8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
      %extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
      %9 = tensor.empty() : tensor<64x128xf16>
      %10 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%9 : tensor<64x128xf16>) -> tensor<64x128xf16>
      %11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16>
      %extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
      %12 = tensor.empty() : tensor<128x64xf16>
      %13 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%12 : tensor<128x64xf16>) -> tensor<128x64xf16>
      %14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16>
      %15 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16>
      %16 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16>
      %17 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
      %18 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%15, %16 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%17 : tensor<64x64xf32>) -> tensor<64x64xf32>
      %19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
      scf.yield %19 : tensor<64x64xf32>
    }
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
  return
}
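// Editor's note: LLVMGPUConfigureTensorLayoutsPass wrapped the matmul operands and accumulator in
// iree_vector_ext.to_layout ops. Each nested layout describes how a 64x128, 128x64, or 64x64 tile is split across
// subgroups, threads, and per-thread elements for the MFMA_F32_16x16x16_F16 intrinsic; the to_layout ops carrying
// shared_memory_conversion correspond to the two promoted operands.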
// -----// IR Dump After IREELoopInvariantCodeMotionPass (iree-loop-invariant-code-motion) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
  %c128 = arith.constant 128 : index
  %c256 = arith.constant 256 : index
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
  %5 = tensor.empty() : tensor<256x256xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32>
    %7 = linalg.fill ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) -> tensor<64x64xf32>
    %8 = tensor.empty() : tensor<64x128xf16>
    %9 = tensor.empty() : tensor<128x64xf16>
    %10 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) {
      %extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16>
      %11 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%8 : tensor<64x128xf16>) -> tensor<64x128xf16>
      %12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16>
      %extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16>
      %13 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%9 : tensor<128x64xf16>) -> tensor<128x64xf16>
      %14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16>
      %15 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16>
      %16 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16>
      %17 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
      %18 = linalg.matmul {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, promote_operands = [0, 1], reduction = [0, 0, 128], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>} ins(%15, %16 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%17 : tensor<64x64xf32>) -> tensor<64x64xf32>
      %19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32>
      scf.yield %19 : tensor<64x64xf32>
    }
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %10 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return | |
} | |
// -----// IR Dump After LinalgGeneralizeNamedOpsPass (linalg-generalize-named-ops) //----- // | |
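// Note: the named ops are generalized here: linalg.fill and the two linalg.copy ops become identity linalg.generic ops, and linalg.matmul becomes a three-loop linalg.generic whose body spells out the f16->f32 extension, multiply, and accumulate.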
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64x64xf32> | |
%8 = tensor.empty() : tensor<64x128xf16> | |
%9 = tensor.empty() : tensor<128x64xf16> | |
%10 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) { | |
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%8 : tensor<64x128xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<64x128xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%9 : tensor<128x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<128x64xf16> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16> | |
%15 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16> | |
%16 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16> | |
%17 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%15, %16 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%17 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_2: f16, %out: f32): | |
%20 = arith.extf %in : f16 to f32 | |
%21 = arith.extf %in_2 : f16 to f32 | |
%22 = arith.mulf %20, %21 : f32 | |
%23 = arith.addf %out, %22 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<64x64xf32> | |
%19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
scf.yield %19 : tensor<64x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %10 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After VectorExtFoldUnitExtentDimsPass (iree-vector-ext-fold-unit-extent-dims) //----- // | |
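// Note: no unit-extent dimensions appear on the iree_vector_ext ops in this dispatch, so this dump looks identical to the previous one.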
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64x64xf32> | |
%8 = tensor.empty() : tensor<64x128xf16> | |
%9 = tensor.empty() : tensor<128x64xf16> | |
%10 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) { | |
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%8 : tensor<64x128xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<64x128xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%9 : tensor<128x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<128x64xf16> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16> | |
%15 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16> | |
%16 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16> | |
%17 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%15, %16 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%17 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_2: f16, %out: f32): | |
%20 = arith.extf %in : f16 to f32 | |
%21 = arith.extf %in_2 : f16 to f32 | |
%22 = arith.mulf %20, %21 : f32 | |
%23 = arith.addf %out, %22 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<64x64xf32> | |
%19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
scf.yield %19 : tensor<64x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %10 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After LinalgFoldUnitExtentDimsPass (linalg-fold-unit-extent-dims) //----- // | |
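// Note: likewise, there are no unit-extent dims to fold on the linalg ops; the IR is unchanged.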
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64x64xf32> | |
%8 = tensor.empty() : tensor<64x128xf16> | |
%9 = tensor.empty() : tensor<128x64xf16> | |
%10 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) { | |
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_0 : tensor<64x128xf16>) outs(%8 : tensor<64x128xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<64x128xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice_1 : tensor<128x64xf16>) outs(%9 : tensor<128x64xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<128x64xf16> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16> | |
%15 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16> | |
%16 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16> | |
%17 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%15, %16 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%17 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_2: f16, %out: f32): | |
%20 = arith.extf %in : f16 to f32 | |
%21 = arith.extf %in_2 : f16 to f32 | |
%22 = arith.mulf %20, %21 : f32 | |
%23 = arith.addf %out, %22 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<64x64xf32> | |
%19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
scf.yield %19 : tensor<64x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %10 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
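// Note: canonicalization folds away the identity-copy generics and their tensor.empty destinations, so the first pair of to_layout ops now consume the tensor.extract_slice results directly; the remaining values are renumbered accordingly.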
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64x64xf32> | |
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) { | |
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16> | |
%9 = iree_vector_ext.to_layout %extracted_slice_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16> | |
%10 = iree_vector_ext.to_layout %extracted_slice_1 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16> | |
%11 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16> | |
%12 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16> | |
%13 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%13 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_2: f16, %out: f32): | |
%16 = arith.extf %in : f16 to f32 | |
%17 = arith.extf %in_2 : f16 to f32 | |
%18 = arith.mulf %16, %17 : f32 | |
%19 = arith.addf %out, %18 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<64x64xf32> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
scf.yield %15 : tensor<64x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
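// Note: CSE appears to find nothing to deduplicate here; the IR matches the canonicalized form above.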
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64x64xf32> | |
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) { | |
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16> | |
%9 = iree_vector_ext.to_layout %extracted_slice_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16> | |
%10 = iree_vector_ext.to_layout %extracted_slice_1 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16> | |
%11 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16> | |
%12 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16> | |
%13 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%13 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_2: f16, %out: f32): | |
%16 = arith.extf %in : f16 to f32 | |
%17 = arith.extf %in_2 : f16 to f32 | |
%18 = arith.mulf %16, %17 : f32 | |
%19 = arith.addf %out, %18 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<64x64xf32> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
scf.yield %15 : tensor<64x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- // | |
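// Note: the visible effect is a reordering only: the two tensor.extract_slice ops for the A and B tiles are now grouped at the top of the K-loop body, ahead of the to_layout ops.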
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64x64xf32> | |
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) { | |
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16> | |
%9 = iree_vector_ext.to_layout %extracted_slice_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16> | |
%10 = iree_vector_ext.to_layout %extracted_slice_1 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16> | |
%11 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16> | |
%12 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16> | |
%13 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%13 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_2: f16, %out: f32): | |
%16 = arith.extf %in : f16 to f32 | |
%17 = arith.extf %in_2 : f16 to f32 | |
%18 = arith.mulf %16, %17 : f32 | |
%19 = arith.addf %out, %18 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<64x64xf32> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
scf.yield %15 : tensor<64x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After DecomposeConvolutionToLowerDimOpsPass (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- // | |
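// Note: this dispatch contains no convolutions, so the pass is a no-op and the dump is unchanged.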
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64x64xf32> | |
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) { | |
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16> | |
%9 = iree_vector_ext.to_layout %extracted_slice_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16> | |
%10 = iree_vector_ext.to_layout %extracted_slice_1 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16> | |
%11 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16> | |
%12 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16> | |
%13 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%13 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_2: f16, %out: f32): | |
%16 = arith.extf %in : f16 to f32 | |
%17 = arith.extf %in_2 : f16 to f32 | |
%18 = arith.mulf %16, %17 : f32 | |
%19 = arith.addf %out, %18 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<64x64xf32> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
scf.yield %15 : tensor<64x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After DecomposeIm2colPass (iree-linalg-ext-decompose-im2col) //----- // | |
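// Note: there are no im2col ops to decompose either; the dump is unchanged.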
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f32) outs(%extracted_slice : tensor<64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64x64xf32> | |
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) { | |
%extracted_slice_0 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16> | |
%9 = iree_vector_ext.to_layout %extracted_slice_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<64x128xf16> | |
%10 = iree_vector_ext.to_layout %extracted_slice_1 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : tensor<128x64xf16> | |
%11 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<64x128xf16> | |
%12 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : tensor<128x64xf16> | |
%13 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%13 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_2: f16, %out: f32): | |
%16 = arith.extf %in : f16 to f32 | |
%17 = arith.extf %in_2 : f16 to f32 | |
%18 = arith.mulf %16, %17 : f32 | |
%19 = arith.addf %out, %18 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<64x64xf32> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : tensor<64x64xf32> | |
scf.yield %15 : tensor<64x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After VectorizeIREEVectorExtOpsPass (iree-vector-ext-vectorize-ops) //----- // | |
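// Note: the to_layout ops are moved from tensor values onto vectors: each operand is bridged with a vector.transfer_read before the layout annotation and a vector.transfer_write into a fresh tensor.empty after it, so the layout anchors now apply to vector<64x128xf16>, vector<128x64xf16>, and vector<64x64xf32> values.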
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_0 : f32) outs(%extracted_slice : tensor<64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<64x64xf32> | |
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) { | |
%extracted_slice_1 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16> | |
%extracted_slice_2 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16> | |
%9 = vector.transfer_read %extracted_slice_1[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x128xf16>, vector<64x128xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%11 = vector.transfer_read %extracted_slice_2[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<128x64xf16>, vector<128x64xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%13 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x128xf16> | |
%14 = tensor.empty() : tensor<64x128xf16> | |
%15 = vector.transfer_write %13, %14[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, tensor<64x128xf16> | |
%16 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<128x64xf16> | |
%17 = tensor.empty() : tensor<128x64xf16> | |
%18 = vector.transfer_write %16, %17[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, tensor<128x64xf16> | |
%19 = vector.transfer_read %arg4[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32> | |
%20 = iree_vector_ext.to_layout %19 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%21 = tensor.empty() : tensor<64x64xf32> | |
%22 = vector.transfer_write %20, %21[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%15, %18 : tensor<64x128xf16>, tensor<128x64xf16>) outs(%22 : tensor<64x64xf32>) { | |
^bb0(%in: f16, %in_3: f16, %out: f32): | |
%28 = arith.extf %in : f16 to f32 | |
%29 = arith.extf %in_3 : f16 to f32 | |
%30 = arith.mulf %28, %29 : f32 | |
%31 = arith.addf %out, %30 : f32 | |
linalg.yield %31 : f32 | |
} -> tensor<64x64xf32> | |
%24 = vector.transfer_read %23[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32> | |
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%26 = tensor.empty() : tensor<64x64xf32> | |
%27 = vector.transfer_write %25, %26[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
scf.yield %27 : tensor<64x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- // | |
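// Note: relative to the previous dump, the linalg.fill appears to have been vectorized into a vector.transfer_write of a dense<0.0> vector,
// and the elementwise linalg.generic into a single vector.contract fed directly by the to_layout results, dropping the intermediate
// tensor.empty round-trips around the LHS/RHS tiles.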
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> | |
%7 = vector.transfer_write %cst, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) { | |
%extracted_slice_2 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16> | |
%extracted_slice_3 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16> | |
%9 = vector.transfer_read %extracted_slice_2[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x128xf16>, vector<64x128xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%11 = vector.transfer_read %extracted_slice_3[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<128x64xf16>, vector<128x64xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%13 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x128xf16> | |
%14 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<128x64xf16> | |
%15 = vector.transfer_read %arg4[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %16 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%19 = tensor.empty() : tensor<64x64xf32> | |
%20 = vector.transfer_write %18, %19[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
scf.yield %20 : tensor<64x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
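// Note: this canonicalization appears to leave the IR unchanged relative to the previous dump.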
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> | |
%7 = vector.transfer_write %cst, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) { | |
%extracted_slice_2 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16> | |
%extracted_slice_3 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16> | |
%9 = vector.transfer_read %extracted_slice_2[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x128xf16>, vector<64x128xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%11 = vector.transfer_read %extracted_slice_3[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<128x64xf16>, vector<128x64xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%13 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x128xf16> | |
%14 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<128x64xf16> | |
%15 = vector.transfer_read %arg4[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %16 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%19 = tensor.empty() : tensor<64x64xf32> | |
%20 = vector.transfer_write %18, %19[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
scf.yield %20 : tensor<64x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
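// Note: CSE appears to find no redundant subexpressions here; the IR matches the previous dump.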
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> | |
%7 = vector.transfer_write %cst, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %7) -> (tensor<64x64xf32>) { | |
%extracted_slice_2 = tensor.extract_slice %3[%arg0, %arg3] [64, 128] [1, 1] : tensor<256x256xf16> to tensor<64x128xf16> | |
%extracted_slice_3 = tensor.extract_slice %4[%arg3, %arg1] [128, 64] [1, 1] : tensor<256x256xf16> to tensor<128x64xf16> | |
%9 = vector.transfer_read %extracted_slice_2[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x128xf16>, vector<64x128xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%11 = vector.transfer_read %extracted_slice_3[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<128x64xf16>, vector<128x64xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%13 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x128xf16> | |
%14 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<128x64xf16> | |
%15 = vector.transfer_read %arg4[%c0, %c0], %cst_1 {in_bounds = [true, true]} : tensor<64x64xf32>, vector<64x64xf32> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %16 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%19 = tensor.empty() : tensor<64x64xf32> | |
%20 = vector.transfer_write %18, %19[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
scf.yield %20 : tensor<64x64xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- // | |
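// Note: the accumulator is now carried through the scf.for as a vector<64x64xf32> seeded with the zero constant instead of as a tensor,
// the LHS/RHS extract_slices are folded into transfer_reads taken directly on %3 and %4, and a single transfer_write of the loop result
// into a tensor.empty is hoisted below the loop before the parallel_insert_slice.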
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%7 = tensor.empty() : tensor<64x64xf32> | |
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) { | |
%10 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%12 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%14 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x128xf16> | |
%15 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<128x64xf16> | |
%16 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %18 : vector<64x64xf32> | |
} | |
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
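// Note: this canonicalization appears to leave the IR unchanged relative to the previous dump.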
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%7 = tensor.empty() : tensor<64x64xf32> | |
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) { | |
%10 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%12 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%14 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x128xf16> | |
%15 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<128x64xf16> | |
%16 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %18 : vector<64x64xf32> | |
} | |
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
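// Note: CSE appears to find nothing to eliminate; the IR matches the previous dump.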
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%7 = tensor.empty() : tensor<64x64xf32> | |
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) { | |
%10 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%12 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%14 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<64x128xf16> | |
%15 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, shared_memory_conversion} : vector<128x64xf16> | |
%16 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %18 : vector<64x64xf32> | |
} | |
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After GPUVectorAllocPass (iree-codegen-gpu-vector-alloc) //----- // | |
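// Note: the operands previously tagged shared_memory_conversion are now staged through bufferization.alloc_tensor allocations in
// #gpu.address_space<workgroup>: each tile is written to shared memory, synchronized with iree_gpu.value_barrier, and read back before
// the MFMA to_layout. The pass also emits gpu.barrier ops and duplicate index/f16 zero constants that later passes clean up.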
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%7 = tensor.empty() : tensor<64x64xf32> | |
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) { | |
gpu.barrier | |
gpu.barrier | |
%10 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%12 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x128xf16, #gpu.address_space<workgroup>> | |
%c0_1 = arith.constant 0 : index | |
%15 = vector.transfer_write %11, %14[%c0_1, %c0_1] {in_bounds = [true, true]} : vector<64x128xf16>, tensor<64x128xf16, #gpu.address_space<workgroup>> | |
%16 = iree_gpu.value_barrier %15 : tensor<64x128xf16, #gpu.address_space<workgroup>> | |
%c0_2 = arith.constant 0 : index | |
%cst_3 = arith.constant 0.000000e+00 : f16 | |
%17 = vector.transfer_read %16[%c0_2, %c0_2], %cst_3 {in_bounds = [true, true]} : tensor<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16> | |
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16> | |
%19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%c0_4 = arith.constant 0 : index | |
%20 = vector.transfer_write %13, %19[%c0_4, %c0_4] {in_bounds = [true, true]} : vector<128x64xf16>, tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%21 = iree_gpu.value_barrier %20 : tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%c0_5 = arith.constant 0 : index | |
%cst_6 = arith.constant 0.000000e+00 : f16 | |
%22 = vector.transfer_read %21[%c0_5, %c0_5], %cst_6 {in_bounds = [true, true]} : tensor<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16> | |
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16> | |
%24 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%25 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %18, %23, %24 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %26 : vector<64x64xf32> | |
} | |
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
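// Note: the two adjacent gpu.barrier ops are folded into one, and the duplicated zero/index constants are replaced by the existing
// %c0 and %cst_0.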
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%7 = tensor.empty() : tensor<64x64xf32> | |
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) { | |
gpu.barrier | |
%10 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%12 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x128xf16, #gpu.address_space<workgroup>> | |
%15 = vector.transfer_write %11, %14[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, tensor<64x128xf16, #gpu.address_space<workgroup>> | |
%16 = iree_gpu.value_barrier %15 : tensor<64x128xf16, #gpu.address_space<workgroup>> | |
%17 = vector.transfer_read %16[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16> | |
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16> | |
%19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%20 = vector.transfer_write %13, %19[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%21 = iree_gpu.value_barrier %20 : tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%22 = vector.transfer_read %21[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16> | |
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16> | |
%24 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%25 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %18, %23, %24 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %26 : vector<64x64xf32> | |
} | |
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
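// Note: CSE appears to leave the IR unchanged relative to the canonicalized form above.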
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%7 = tensor.empty() : tensor<64x64xf32> | |
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) { | |
gpu.barrier | |
%10 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%12 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x128xf16, #gpu.address_space<workgroup>> | |
%15 = vector.transfer_write %11, %14[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, tensor<64x128xf16, #gpu.address_space<workgroup>> | |
%16 = iree_gpu.value_barrier %15 : tensor<64x128xf16, #gpu.address_space<workgroup>> | |
%17 = vector.transfer_read %16[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16> | |
%18 = iree_vector_ext.to_layout %17 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16> | |
%19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%20 = vector.transfer_write %13, %19[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%21 = iree_gpu.value_barrier %20 : tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%22 = vector.transfer_read %21[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16> | |
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16> | |
%24 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%25 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %18, %23, %24 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %26 : vector<64x64xf32> | |
} | |
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After GPUCombineValueBarriersPass (iree-codegen-gpu-combine-value-barriers) //----- // | |
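// Note: the two separate iree_gpu.value_barrier ops are combined into a single two-operand barrier (%18:2), so both shared-memory
// writes are synchronized once before their corresponding reads.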
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = tensor.empty() : tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%7 = tensor.empty() : tensor<64x64xf32> | |
%8 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) { | |
gpu.barrier | |
%10 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%12 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x128xf16, #gpu.address_space<workgroup>> | |
%15 = vector.transfer_write %11, %14[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, tensor<64x128xf16, #gpu.address_space<workgroup>> | |
%16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%17 = vector.transfer_write %13, %16[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%18:2 = iree_gpu.value_barrier %15, %17 : tensor<64x128xf16, #gpu.address_space<workgroup>>, tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%19 = vector.transfer_read %18#0[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16> | |
%20 = iree_vector_ext.to_layout %19 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16> | |
%21 = vector.transfer_read %18#1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16> | |
%22 = iree_vector_ext.to_layout %21 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16> | |
%23 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %20, %22, %23 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%25 = iree_vector_ext.to_layout %24 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %25 : vector<64x64xf32> | |
} | |
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- // | |
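// Note (annotation): relative to the previous dump, the empty destination tensor feeding the
// scf.forall shared_outs has been replaced by a flow.dispatch.tensor.load of the output
// binding (%5), and each workgroup's accumulator is now written into a tensor.extract_slice
// of that shared output rather than into a fresh tensor.empty.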
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> -> tensor<256x256xf32> | |
%6 = tensor.empty() : tensor<256x256xf32> | |
%7 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> | |
%8 = tensor.empty() : tensor<64x64xf32> | |
%9 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) { | |
gpu.barrier | |
%11 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%13 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x128xf16, #gpu.address_space<workgroup>> | |
%16 = vector.transfer_write %12, %15[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, tensor<64x128xf16, #gpu.address_space<workgroup>> | |
%17 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%18 = vector.transfer_write %14, %17[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%19:2 = iree_gpu.value_barrier %16, %18 : tensor<64x128xf16, #gpu.address_space<workgroup>>, tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%20 = vector.transfer_read %19#0[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16> | |
%21 = iree_vector_ext.to_layout %20 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16> | |
%22 = vector.transfer_read %19#1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16> | |
%23 = iree_vector_ext.to_layout %22 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16> | |
%24 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%25 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %21, %23, %24 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%26 = iree_vector_ext.to_layout %25 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %26 : vector<64x64xf32> | |
} | |
%10 = vector.transfer_write %9, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %10 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- // | |
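// Note (annotation): the tensor.empty ops left unused by the previous rewrite no longer appear;
// the only bufferization.alloc_tensor ops remaining are the two workgroup-memory staging
// buffers inside the K loop.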
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16> | |
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> -> tensor<256x256xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<256x256xf32>) { | |
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> | |
%7 = scf.for %arg3 = %c0 to %c256 step %c128 iter_args(%arg4 = %cst) -> (vector<64x64xf32>) { | |
gpu.barrier | |
%9 = vector.transfer_read %3[%arg0, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<64x128xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%11 = vector.transfer_read %4[%arg3, %arg1], %cst_0 {in_bounds = [true, true]} : tensor<256x256xf16>, vector<128x64xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x128xf16, #gpu.address_space<workgroup>> | |
%14 = vector.transfer_write %10, %13[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, tensor<64x128xf16, #gpu.address_space<workgroup>> | |
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%16 = vector.transfer_write %12, %15[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%17:2 = iree_gpu.value_barrier %14, %16 : tensor<64x128xf16, #gpu.address_space<workgroup>>, tensor<128x64xf16, #gpu.address_space<workgroup>> | |
%18 = vector.transfer_read %17#0[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16> | |
%19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16> | |
%20 = vector.transfer_read %17#1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16> | |
%21 = iree_vector_ext.to_layout %20 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16> | |
%22 = iree_vector_ext.to_layout %arg4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%23 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %19, %21, %22 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %24 : vector<64x64xf32> | |
} | |
%8 = vector.transfer_write %7, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>> | |
return | |
} | |
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- // | |
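// Note (annotation): bufferization switches the function from tensors to memrefs. The
// hal.interface.binding.subspan results are now memrefs with memref.assume_alignment,
// flow.dispatch.tensor.load/store are gone, bufferization.alloc_tensor becomes memref.alloc
// in workgroup memory, iree_gpu.value_barrier is replaced by a plain gpu.barrier, and the
// accumulator is written through a memref.subview of the output binding, followed by
// memref.copy ops that later canonicalization and CSE clean up.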
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) { | |
gpu.barrier | |
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%alloc_2 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %7, %alloc_2[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16> | |
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16> | |
%10 = vector.transfer_read %alloc_2[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16> | |
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %14 : vector<64x64xf32> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview_1 : memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
memref.copy %2, %2 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
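// Note (annotation): no changes; this dump is identical to the bufferization output above.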
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) { | |
gpu.barrier | |
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%alloc_2 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %7, %alloc_2[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16> | |
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16> | |
%10 = vector.transfer_read %alloc_2[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16> | |
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %14 : vector<64x64xf32> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview_1 : memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
memref.copy %2, %2 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
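// Note (annotation): canonicalization removes the redundant whole-buffer memref.copy %2, %2
// that followed the scf.forall; the per-tile subview copy is still present at this point.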
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) { | |
gpu.barrier | |
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%alloc_2 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %7, %alloc_2[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16> | |
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16> | |
%10 = vector.transfer_read %alloc_2[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16> | |
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %14 : vector<64x64xf32> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview_1 : memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
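// Note (annotation): CSE folds the duplicate memref.subview of the output tile into a single
// %subview, which turns the remaining tile copy into a self-copy
// (memref.copy %subview, %subview) and renames %alloc_2 to %alloc_1.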
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) { | |
gpu.barrier | |
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%alloc_1 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %7, %alloc_1[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16> | |
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16> | |
%10 = vector.transfer_read %alloc_1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16> | |
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %14 : vector<64x64xf32> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview : memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
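// Note (annotation): the self-copy left behind by CSE is folded away here; the workgroup tile
// now ends with just the vector.transfer_write of the accumulator into the output subview.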
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) { | |
gpu.barrier | |
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%alloc_1 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %7, %alloc_1[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16> | |
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16> | |
%10 = vector.transfer_read %alloc_1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16> | |
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %14 : vector<64x64xf32> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- // | |
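// Note (annotation): no changes; this dump matches the previous one.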
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) { | |
gpu.barrier | |
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%alloc_1 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %7, %alloc_1[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16> | |
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16> | |
%10 = vector.transfer_read %alloc_1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16> | |
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %14 : vector<64x64xf32> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
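// Note (annotation): no further simplifications; identical to the previous dump.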
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) { | |
gpu.barrier | |
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%alloc_1 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %7, %alloc_1[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16> | |
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16> | |
%10 = vector.transfer_read %alloc_1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16> | |
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %14 : vector<64x64xf32> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
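// Note: at this point the dispatch is bufferized. The three hal.interface.binding.subspan
// results are memrefs (two readonly f16 inputs, one f32 output), the K dimension is walked by
// an scf.for with step 128 that carries the f32 accumulator vector as an iter_arg, and each
// iteration stages the 64x128 / 128x64 tiles through workgroup shared memory between the two
// gpu.barrier ops.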
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) { | |
gpu.barrier | |
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_0 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
%alloc = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %5, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%alloc_1 = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %7, %alloc_1[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%8 = vector.transfer_read %alloc[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16> | |
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16> | |
%10 = vector.transfer_read %alloc_1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16> | |
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %14 : vector<64x64xf32> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// (no changes from the previous dump)
// -----// IR Dump After CSE (cse) //----- // | |
// (no changes from the previous dump)
// -----// IR Dump After HoistStaticallyBoundAllocationsPass (iree-codegen-hoist-statically-bound-allocations) //----- // | |
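// Note: compared with the previous dump, the two workgroup-memory buffers
// (memref<64x128xf16> and memref<128x64xf16>) are now allocated once in the function entry
// block and deallocated just before the return, instead of being allocated inside the K loop
// on every iteration.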
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%alloc = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>> | |
%alloc_0 = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>> | |
%cst = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%cst_1 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst) -> (vector<64x64xf32>) { | |
gpu.barrier | |
%4 = vector.transfer_read %0[%arg0, %arg2], %cst_1 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%6 = vector.transfer_read %1[%arg2, %arg1], %cst_1 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
vector.transfer_write %5, %alloc_0[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %7, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%8 = vector.transfer_read %alloc_0[%c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16> | |
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16> | |
%10 = vector.transfer_read %alloc[%c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16> | |
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %14 : vector<64x64xf32> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
memref.dealloc %alloc_0 : memref<64x128xf16, #gpu.address_space<workgroup>> | |
memref.dealloc %alloc : memref<128x64xf16, #gpu.address_space<workgroup>> | |
return | |
} | |
// -----// IR Dump After LLVMGPUCastTypeToFitMMAPass (iree-llvmgpu-cast-type-to-fit-mma) //----- // | |
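// Note: the visible effect of this pass is on the vector.contract, which now carries an
// iree.amdgpu.mma = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16> attribute for the later
// distribution step; since the inputs are f16 with an f32 accumulator, no operand casts
// appear to be needed and the IR is otherwise unchanged.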
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%c0 = arith.constant 0 : index | |
%c256 = arith.constant 256 : index | |
%c128 = arith.constant 128 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<64x64xf32> | |
%alloc = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>> | |
%alloc_1 = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_0) -> (vector<64x64xf32>) { | |
gpu.barrier | |
%4 = vector.transfer_read %0[%arg0, %arg2], %cst {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<64x128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<64x128xf16> | |
%6 = vector.transfer_read %1[%arg2, %arg1], %cst {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<128x64xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [32, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>) : vector<128x64xf16> | |
vector.transfer_write %5, %alloc_1[%c0, %c0] {in_bounds = [true, true]} : vector<64x128xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
vector.transfer_write %7, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<128x64xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%8 = vector.transfer_read %alloc_1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<64x128xf16> | |
%9 = iree_vector_ext.to_layout %8 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 8], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [2, 0], thread_strides = [1, 16]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16> | |
%10 = vector.transfer_read %alloc[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<128x64xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 2], batch_tile = [8, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [0, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<128x64xf16> | |
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %9, %11, %12 {iree.amdgpu.mma = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x128xf16>, vector<128x64xf16> into vector<64x64xf32> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [2, 2], batch_tile = [2, 2], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [4, 1], subgroup_strides = [2, 1], thread_strides = [16, 1]>) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32> | |
scf.yield %14 : vector<64x64xf32> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
memref.dealloc %alloc_1 : memref<64x128xf16, #gpu.address_space<workgroup>> | |
memref.dealloc %alloc : memref<128x64xf16, #gpu.address_space<workgroup>> | |
return | |
} | |
// -----// IR Dump After LLVMGPUVectorDistributePass (iree-llvmgpu-vector-distribute) //----- // | |
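// Note: this pass distributes the workgroup-level vectors across the 256 threads according to
// the nested layouts above. The global loads become per-thread vector<1x8xf16> reads, the
// shared-memory reads become vector<1x4xf16> / vector<4x1xf16> fragments indexed by
// affine.apply expressions over the linearized thread id, and the vector.contract is expanded
// into chains of amdgpu.mfma (16x16x16, f16 -> f32) operations on vector<4xf16>/vector<4xf32>
// fragments.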
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%thread_id_z = gpu.thread_id z | |
%thread_id_y = gpu.thread_id y | |
%thread_id_x = gpu.thread_id x | |
%0 = affine.linearize_index disjoint [%thread_id_z, %thread_id_y, %thread_id_x] by (1, 1, 256) : index | |
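  // %0 is the thread id linearized over the [1, 1, 256] workgroup, i.e. a value in [0, 256);
  // the affine.apply maps below effectively decompose it into per-subgroup and per-lane
  // row/column offsets.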
%alloc = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>> | |
%alloc_3 = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %3, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%subview = memref.subview %3[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%4 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) { | |
gpu.barrier | |
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %0] | |
%18 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %0] | |
%19 = vector.transfer_read %1[%17, %18], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%20 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %0] | |
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %0] | |
%22 = vector.transfer_read %1[%20, %21], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %0] | |
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %0] | |
%25 = vector.transfer_read %1[%23, %24], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %0] | |
%27 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %0] | |
%28 = vector.transfer_read %1[%26, %27], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%29 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %0] | |
%30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %0] | |
%31 = vector.transfer_read %2[%29, %30], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%32 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %0] | |
%33 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %0] | |
%34 = vector.transfer_read %2[%32, %33], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%35 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %0] | |
%36 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %0] | |
%37 = vector.transfer_read %2[%35, %36], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%38 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %0] | |
%39 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %0] | |
%40 = vector.transfer_read %2[%38, %39], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%0] | |
%42 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%0] | |
vector.transfer_write %19, %alloc_3[%41, %42] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%43 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%0] | |
%44 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%0] | |
vector.transfer_write %22, %alloc_3[%43, %44] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%45 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%0] | |
%46 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%0] | |
vector.transfer_write %25, %alloc_3[%45, %46] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%47 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%0] | |
%48 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%0] | |
vector.transfer_write %28, %alloc_3[%47, %48] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%49 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%0] | |
%50 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
vector.transfer_write %31, %alloc[%49, %50] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
%51 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%0] | |
%52 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
vector.transfer_write %34, %alloc[%51, %52] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
%53 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%0] | |
%54 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
vector.transfer_write %37, %alloc[%53, %54] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
%55 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%0] | |
%56 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%0] | |
vector.transfer_write %40, %alloc[%55, %56] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%57 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%58 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%59 = vector.transfer_read %alloc_3[%57, %58], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%60 = vector.insert_strided_slice %59, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%61 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%62 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%63 = vector.transfer_read %alloc_3[%61, %62], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%64 = vector.insert_strided_slice %63, %60 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%65 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%66 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0] | |
%67 = vector.transfer_read %alloc_3[%65, %66], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%68 = vector.insert_strided_slice %67, %64 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%69 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%70 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0] | |
%71 = vector.transfer_read %alloc_3[%69, %70], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%72 = vector.insert_strided_slice %71, %68 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%73 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%74 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%0] | |
%75 = vector.transfer_read %alloc_3[%73, %74], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%76 = vector.insert_strided_slice %75, %72 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%77 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%78 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%0] | |
%79 = vector.transfer_read %alloc_3[%77, %78], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%80 = vector.insert_strided_slice %79, %76 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%81 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%82 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%0] | |
%83 = vector.transfer_read %alloc_3[%81, %82], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%84 = vector.insert_strided_slice %83, %80 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%85 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%86 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%0] | |
%87 = vector.transfer_read %alloc_3[%85, %86], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%88 = vector.insert_strided_slice %87, %84 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%89 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%90 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%91 = vector.transfer_read %alloc_3[%89, %90], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%92 = vector.insert_strided_slice %91, %88 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%93 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%94 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%95 = vector.transfer_read %alloc_3[%93, %94], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%96 = vector.insert_strided_slice %95, %92 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%97 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%98 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0] | |
%99 = vector.transfer_read %alloc_3[%97, %98], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%100 = vector.insert_strided_slice %99, %96 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%101 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%102 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0] | |
%103 = vector.transfer_read %alloc_3[%101, %102], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%104 = vector.insert_strided_slice %103, %100 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%105 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%106 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%0] | |
%107 = vector.transfer_read %alloc_3[%105, %106], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%108 = vector.insert_strided_slice %107, %104 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%109 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%110 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%0] | |
%111 = vector.transfer_read %alloc_3[%109, %110], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%112 = vector.insert_strided_slice %111, %108 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%113 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%114 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%0] | |
%115 = vector.transfer_read %alloc_3[%113, %114], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%116 = vector.insert_strided_slice %115, %112 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%117 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%118 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%0] | |
%119 = vector.transfer_read %alloc_3[%117, %118], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%120 = vector.insert_strided_slice %119, %116 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%121 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%122 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%123 = vector.transfer_read %alloc[%121, %122], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%124 = vector.insert_strided_slice %123, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%125 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%126 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%127 = vector.transfer_read %alloc[%125, %126], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%128 = vector.insert_strided_slice %127, %124 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%129 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%130 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%131 = vector.transfer_read %alloc[%129, %130], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%132 = vector.insert_strided_slice %131, %128 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%133 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%134 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%135 = vector.transfer_read %alloc[%133, %134], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%136 = vector.insert_strided_slice %135, %132 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%137 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0] | |
%138 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%139 = vector.transfer_read %alloc[%137, %138], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%140 = vector.insert_strided_slice %139, %136 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%141 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%0] | |
%142 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%143 = vector.transfer_read %alloc[%141, %142], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%144 = vector.insert_strided_slice %143, %140 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%145 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0] | |
%146 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%147 = vector.transfer_read %alloc[%145, %146], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%148 = vector.insert_strided_slice %147, %144 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%149 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%0] | |
%150 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%151 = vector.transfer_read %alloc[%149, %150], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%152 = vector.insert_strided_slice %151, %148 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%153 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%0] | |
%154 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%155 = vector.transfer_read %alloc[%153, %154], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%156 = vector.insert_strided_slice %155, %152 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%157 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%0] | |
%158 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%159 = vector.transfer_read %alloc[%157, %158], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%160 = vector.insert_strided_slice %159, %156 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%161 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%0] | |
%162 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%163 = vector.transfer_read %alloc[%161, %162], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%164 = vector.insert_strided_slice %163, %160 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%165 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%0] | |
%166 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%167 = vector.transfer_read %alloc[%165, %166], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%168 = vector.insert_strided_slice %167, %164 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%169 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%0] | |
%170 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%171 = vector.transfer_read %alloc[%169, %170], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%172 = vector.insert_strided_slice %171, %168 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%173 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%0] | |
%174 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%175 = vector.transfer_read %alloc[%173, %174], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%176 = vector.insert_strided_slice %175, %172 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%177 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%0] | |
%178 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%179 = vector.transfer_read %alloc[%177, %178], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%180 = vector.insert_strided_slice %179, %176 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%181 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%0] | |
%182 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%183 = vector.transfer_read %alloc[%181, %182], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%184 = vector.insert_strided_slice %183, %180 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
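      // The per-thread accumulator holds 2x2 MFMA fragments; each fragment below is produced
      // by a chain of 8 amdgpu.mfma ops walking the 8 K-slices of the LHS/RHS fragments,
      // seeded from the corresponding slice of the iter_arg %arg3.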
%185 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%186 = vector.extract %120[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%187 = vector.extract %184[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%188 = vector.shape_cast %186 : vector<1x1x1x4xf16> to vector<4xf16> | |
%189 = vector.shape_cast %187 : vector<1x1x4x1xf16> to vector<4xf16> | |
%190 = vector.shape_cast %185 : vector<1x1x4x1xf32> to vector<4xf32> | |
%191 = amdgpu.mfma %188 * %189 + %190 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%192 = vector.extract %120[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%193 = vector.extract %184[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%194 = vector.shape_cast %192 : vector<1x1x1x4xf16> to vector<4xf16> | |
%195 = vector.shape_cast %193 : vector<1x1x4x1xf16> to vector<4xf16> | |
%196 = amdgpu.mfma %194 * %195 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%197 = vector.extract %120[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%198 = vector.extract %184[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%199 = vector.shape_cast %197 : vector<1x1x1x4xf16> to vector<4xf16> | |
%200 = vector.shape_cast %198 : vector<1x1x4x1xf16> to vector<4xf16> | |
%201 = amdgpu.mfma %199 * %200 + %196 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%202 = vector.extract %120[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%203 = vector.extract %184[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%204 = vector.shape_cast %202 : vector<1x1x1x4xf16> to vector<4xf16> | |
%205 = vector.shape_cast %203 : vector<1x1x4x1xf16> to vector<4xf16> | |
%206 = amdgpu.mfma %204 * %205 + %201 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%207 = vector.extract %120[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%208 = vector.extract %184[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%209 = vector.shape_cast %207 : vector<1x1x1x4xf16> to vector<4xf16> | |
%210 = vector.shape_cast %208 : vector<1x1x4x1xf16> to vector<4xf16> | |
%211 = amdgpu.mfma %209 * %210 + %206 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%212 = vector.extract %120[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%213 = vector.extract %184[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%214 = vector.shape_cast %212 : vector<1x1x1x4xf16> to vector<4xf16> | |
%215 = vector.shape_cast %213 : vector<1x1x4x1xf16> to vector<4xf16> | |
%216 = amdgpu.mfma %214 * %215 + %211 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%217 = vector.extract %120[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%218 = vector.extract %184[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%219 = vector.shape_cast %217 : vector<1x1x1x4xf16> to vector<4xf16> | |
%220 = vector.shape_cast %218 : vector<1x1x4x1xf16> to vector<4xf16> | |
%221 = amdgpu.mfma %219 * %220 + %216 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%222 = vector.extract %120[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%223 = vector.extract %184[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%224 = vector.shape_cast %222 : vector<1x1x1x4xf16> to vector<4xf16> | |
%225 = vector.shape_cast %223 : vector<1x1x4x1xf16> to vector<4xf16> | |
%226 = amdgpu.mfma %224 * %225 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%227 = vector.shape_cast %226 : vector<4xf32> to vector<1x1x4x1xf32> | |
%228 = vector.insert %227, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
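// Subtile [0, 1]: same LHS fragments %120[0, k], RHS column fragments %184[k, 1].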
%229 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%230 = vector.extract %120[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%231 = vector.extract %184[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%232 = vector.shape_cast %230 : vector<1x1x1x4xf16> to vector<4xf16> | |
%233 = vector.shape_cast %231 : vector<1x1x4x1xf16> to vector<4xf16> | |
%234 = vector.shape_cast %229 : vector<1x1x4x1xf32> to vector<4xf32> | |
%235 = amdgpu.mfma %232 * %233 + %234 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%236 = vector.extract %120[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%237 = vector.extract %184[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%238 = vector.shape_cast %236 : vector<1x1x1x4xf16> to vector<4xf16> | |
%239 = vector.shape_cast %237 : vector<1x1x4x1xf16> to vector<4xf16> | |
%240 = amdgpu.mfma %238 * %239 + %235 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%241 = vector.extract %120[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%242 = vector.extract %184[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%243 = vector.shape_cast %241 : vector<1x1x1x4xf16> to vector<4xf16> | |
%244 = vector.shape_cast %242 : vector<1x1x4x1xf16> to vector<4xf16> | |
%245 = amdgpu.mfma %243 * %244 + %240 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%246 = vector.extract %120[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%247 = vector.extract %184[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%248 = vector.shape_cast %246 : vector<1x1x1x4xf16> to vector<4xf16> | |
%249 = vector.shape_cast %247 : vector<1x1x4x1xf16> to vector<4xf16> | |
%250 = amdgpu.mfma %248 * %249 + %245 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%251 = vector.extract %120[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%252 = vector.extract %184[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%253 = vector.shape_cast %251 : vector<1x1x1x4xf16> to vector<4xf16> | |
%254 = vector.shape_cast %252 : vector<1x1x4x1xf16> to vector<4xf16> | |
%255 = amdgpu.mfma %253 * %254 + %250 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%256 = vector.extract %120[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%257 = vector.extract %184[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%258 = vector.shape_cast %256 : vector<1x1x1x4xf16> to vector<4xf16> | |
%259 = vector.shape_cast %257 : vector<1x1x4x1xf16> to vector<4xf16> | |
%260 = amdgpu.mfma %258 * %259 + %255 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%261 = vector.extract %120[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%262 = vector.extract %184[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%263 = vector.shape_cast %261 : vector<1x1x1x4xf16> to vector<4xf16> | |
%264 = vector.shape_cast %262 : vector<1x1x4x1xf16> to vector<4xf16> | |
%265 = amdgpu.mfma %263 * %264 + %260 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%266 = vector.extract %120[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%267 = vector.extract %184[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%268 = vector.shape_cast %266 : vector<1x1x1x4xf16> to vector<4xf16> | |
%269 = vector.shape_cast %267 : vector<1x1x4x1xf16> to vector<4xf16> | |
%270 = amdgpu.mfma %268 * %269 + %265 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%271 = vector.shape_cast %270 : vector<4xf32> to vector<1x1x4x1xf32> | |
%272 = vector.insert %271, %228 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
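// Subtile [1, 0]: LHS fragments %120[1, k], RHS column fragments %184[k, 0].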
%273 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%274 = vector.extract %120[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%275 = vector.extract %184[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%276 = vector.shape_cast %274 : vector<1x1x1x4xf16> to vector<4xf16> | |
%277 = vector.shape_cast %275 : vector<1x1x4x1xf16> to vector<4xf16> | |
%278 = vector.shape_cast %273 : vector<1x1x4x1xf32> to vector<4xf32> | |
%279 = amdgpu.mfma %276 * %277 + %278 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%280 = vector.extract %120[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%281 = vector.extract %184[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%282 = vector.shape_cast %280 : vector<1x1x1x4xf16> to vector<4xf16> | |
%283 = vector.shape_cast %281 : vector<1x1x4x1xf16> to vector<4xf16> | |
%284 = amdgpu.mfma %282 * %283 + %279 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%285 = vector.extract %120[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%286 = vector.extract %184[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%287 = vector.shape_cast %285 : vector<1x1x1x4xf16> to vector<4xf16> | |
%288 = vector.shape_cast %286 : vector<1x1x4x1xf16> to vector<4xf16> | |
%289 = amdgpu.mfma %287 * %288 + %284 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%290 = vector.extract %120[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%291 = vector.extract %184[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%292 = vector.shape_cast %290 : vector<1x1x1x4xf16> to vector<4xf16> | |
%293 = vector.shape_cast %291 : vector<1x1x4x1xf16> to vector<4xf16> | |
%294 = amdgpu.mfma %292 * %293 + %289 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%295 = vector.extract %120[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%296 = vector.extract %184[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%297 = vector.shape_cast %295 : vector<1x1x1x4xf16> to vector<4xf16> | |
%298 = vector.shape_cast %296 : vector<1x1x4x1xf16> to vector<4xf16> | |
%299 = amdgpu.mfma %297 * %298 + %294 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%300 = vector.extract %120[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%301 = vector.extract %184[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%302 = vector.shape_cast %300 : vector<1x1x1x4xf16> to vector<4xf16> | |
%303 = vector.shape_cast %301 : vector<1x1x4x1xf16> to vector<4xf16> | |
%304 = amdgpu.mfma %302 * %303 + %299 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%305 = vector.extract %120[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%306 = vector.extract %184[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%307 = vector.shape_cast %305 : vector<1x1x1x4xf16> to vector<4xf16> | |
%308 = vector.shape_cast %306 : vector<1x1x4x1xf16> to vector<4xf16> | |
%309 = amdgpu.mfma %307 * %308 + %304 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%310 = vector.extract %120[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%311 = vector.extract %184[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%312 = vector.shape_cast %310 : vector<1x1x1x4xf16> to vector<4xf16> | |
%313 = vector.shape_cast %311 : vector<1x1x4x1xf16> to vector<4xf16> | |
%314 = amdgpu.mfma %312 * %313 + %309 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%315 = vector.shape_cast %314 : vector<4xf32> to vector<1x1x4x1xf32> | |
%316 = vector.insert %315, %272 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
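// Subtile [1, 1]: LHS fragments %120[1, k], RHS column fragments %184[k, 1].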
%317 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%318 = vector.extract %120[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%319 = vector.extract %184[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%320 = vector.shape_cast %318 : vector<1x1x1x4xf16> to vector<4xf16> | |
%321 = vector.shape_cast %319 : vector<1x1x4x1xf16> to vector<4xf16> | |
%322 = vector.shape_cast %317 : vector<1x1x4x1xf32> to vector<4xf32> | |
%323 = amdgpu.mfma %320 * %321 + %322 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%324 = vector.extract %120[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%325 = vector.extract %184[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%326 = vector.shape_cast %324 : vector<1x1x1x4xf16> to vector<4xf16> | |
%327 = vector.shape_cast %325 : vector<1x1x4x1xf16> to vector<4xf16> | |
%328 = amdgpu.mfma %326 * %327 + %323 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%329 = vector.extract %120[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%330 = vector.extract %184[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%331 = vector.shape_cast %329 : vector<1x1x1x4xf16> to vector<4xf16> | |
%332 = vector.shape_cast %330 : vector<1x1x4x1xf16> to vector<4xf16> | |
%333 = amdgpu.mfma %331 * %332 + %328 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%334 = vector.extract %120[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%335 = vector.extract %184[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%336 = vector.shape_cast %334 : vector<1x1x1x4xf16> to vector<4xf16> | |
%337 = vector.shape_cast %335 : vector<1x1x4x1xf16> to vector<4xf16> | |
%338 = amdgpu.mfma %336 * %337 + %333 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%339 = vector.extract %120[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%340 = vector.extract %184[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%341 = vector.shape_cast %339 : vector<1x1x1x4xf16> to vector<4xf16> | |
%342 = vector.shape_cast %340 : vector<1x1x4x1xf16> to vector<4xf16> | |
%343 = amdgpu.mfma %341 * %342 + %338 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%344 = vector.extract %120[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%345 = vector.extract %184[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%346 = vector.shape_cast %344 : vector<1x1x1x4xf16> to vector<4xf16> | |
%347 = vector.shape_cast %345 : vector<1x1x4x1xf16> to vector<4xf16> | |
%348 = amdgpu.mfma %346 * %347 + %343 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%349 = vector.extract %120[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%350 = vector.extract %184[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%351 = vector.shape_cast %349 : vector<1x1x1x4xf16> to vector<4xf16> | |
%352 = vector.shape_cast %350 : vector<1x1x4x1xf16> to vector<4xf16> | |
%353 = amdgpu.mfma %351 * %352 + %348 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%354 = vector.extract %120[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%355 = vector.extract %184[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%356 = vector.shape_cast %354 : vector<1x1x1x4xf16> to vector<4xf16> | |
%357 = vector.shape_cast %355 : vector<1x1x4x1xf16> to vector<4xf16> | |
%358 = amdgpu.mfma %356 * %357 + %353 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%359 = vector.shape_cast %358 : vector<4xf32> to vector<1x1x4x1xf32> | |
%360 = vector.insert %359, %316 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
scf.yield %360 : vector<2x2x1x1x4x1xf32> | |
} | |
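// Epilogue: %4 is the accumulator produced by the K loop. Each thread writes its
// 2x2 grid of vector<4x1xf32> fragments into the 64x64 output subview at
// lane-dependent offsets (+0/+16 in both the row and column dimension).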
%5 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%6 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%7 = vector.extract %4[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %7, %subview[%5, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%8 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%0] | |
%9 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%10 = vector.extract %4[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %10, %subview[%8, %9] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%0] | |
%13 = vector.extract %4[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %13, %subview[%11, %12] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%14 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%0] | |
%15 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%0] | |
%16 = vector.extract %4[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %16, %subview[%14, %15] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
memref.dealloc %alloc_3 : memref<64x128xf16, #gpu.address_space<workgroup>> | |
memref.dealloc %alloc : memref<128x64xf16, #gpu.address_space<workgroup>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%thread_id_x = gpu.thread_id x | |
%alloc = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>> | |
%alloc_3 = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) { | |
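// K loop (step 128): first read this iteration's tiles from global memory into
// per-thread vector<1x8xf16> registers - four reads covering a 64x128 slice of the
// LHS (%0) and four covering a 128x64 slice of the RHS (%1).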
gpu.barrier | |
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x] | |
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x] | |
%18 = vector.transfer_read %0[%16, %17], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x] | |
%20 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x] | |
%21 = vector.transfer_read %0[%19, %20], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x] | |
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x] | |
%24 = vector.transfer_read %0[%22, %23], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%25 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x] | |
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x] | |
%27 = vector.transfer_read %0[%25, %26], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x] | |
%29 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x] | |
%30 = vector.transfer_read %1[%28, %29], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%31 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x] | |
%32 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x] | |
%33 = vector.transfer_read %1[%31, %32], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%34 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x] | |
%35 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x] | |
%36 = vector.transfer_read %1[%34, %35], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%37 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x] | |
%38 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x] | |
%39 = vector.transfer_read %1[%37, %38], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
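// Stage the fetched tiles into workgroup memory: LHS rows into %alloc_3 (64x128),
// RHS rows into %alloc (128x64).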
%40 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x] | |
%41 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x] | |
vector.transfer_write %18, %alloc_3[%40, %41] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%42 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x] | |
%43 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x] | |
vector.transfer_write %21, %alloc_3[%42, %43] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x] | |
%45 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x] | |
vector.transfer_write %24, %alloc_3[%44, %45] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%46 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x] | |
%47 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x] | |
vector.transfer_write %27, %alloc_3[%46, %47] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%48 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x] | |
%49 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
vector.transfer_write %30, %alloc[%48, %49] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
%50 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x] | |
%51 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
vector.transfer_write %33, %alloc[%50, %51] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
%52 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x] | |
%53 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
vector.transfer_write %36, %alloc[%52, %53] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
%54 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x] | |
%55 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
vector.transfer_write %39, %alloc[%54, %55] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
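// After the barrier, gather per-thread LHS fragments from %alloc_3 into %119
// (vector<2x8x1x1x1x4xf16>): 2 rows of the M tile x 8 K-steps, vector<1x4xf16> each.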
%56 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%57 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%58 = vector.transfer_read %alloc_3[%56, %57], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%59 = vector.insert_strided_slice %58, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%60 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%61 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%62 = vector.transfer_read %alloc_3[%60, %61], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%63 = vector.insert_strided_slice %62, %59 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%64 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%65 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x] | |
%66 = vector.transfer_read %alloc_3[%64, %65], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%67 = vector.insert_strided_slice %66, %63 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%68 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%69 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x] | |
%70 = vector.transfer_read %alloc_3[%68, %69], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%71 = vector.insert_strided_slice %70, %67 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%72 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%73 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x] | |
%74 = vector.transfer_read %alloc_3[%72, %73], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%75 = vector.insert_strided_slice %74, %71 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%76 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%77 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x] | |
%78 = vector.transfer_read %alloc_3[%76, %77], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%79 = vector.insert_strided_slice %78, %75 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%80 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%81 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x] | |
%82 = vector.transfer_read %alloc_3[%80, %81], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%83 = vector.insert_strided_slice %82, %79 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%84 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%85 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x] | |
%86 = vector.transfer_read %alloc_3[%84, %85], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%87 = vector.insert_strided_slice %86, %83 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%88 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%89 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%90 = vector.transfer_read %alloc_3[%88, %89], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%91 = vector.insert_strided_slice %90, %87 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%92 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%93 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%94 = vector.transfer_read %alloc_3[%92, %93], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%95 = vector.insert_strided_slice %94, %91 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%96 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%97 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x] | |
%98 = vector.transfer_read %alloc_3[%96, %97], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%99 = vector.insert_strided_slice %98, %95 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%100 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%101 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x] | |
%102 = vector.transfer_read %alloc_3[%100, %101], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%103 = vector.insert_strided_slice %102, %99 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%104 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%105 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x] | |
%106 = vector.transfer_read %alloc_3[%104, %105], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%107 = vector.insert_strided_slice %106, %103 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%108 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%109 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x] | |
%110 = vector.transfer_read %alloc_3[%108, %109], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%111 = vector.insert_strided_slice %110, %107 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%112 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%113 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x] | |
%114 = vector.transfer_read %alloc_3[%112, %113], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%115 = vector.insert_strided_slice %114, %111 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%116 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%117 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x] | |
%118 = vector.transfer_read %alloc_3[%116, %117], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%119 = vector.insert_strided_slice %118, %115 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
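// Gather per-thread RHS fragments from %alloc into %183 (vector<8x2x1x1x4x1xf16>):
// 8 K-steps x 2 columns of the N tile, vector<4x1xf16> each.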
%120 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%121 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%122 = vector.transfer_read %alloc[%120, %121], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%123 = vector.insert_strided_slice %122, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%124 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%125 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%126 = vector.transfer_read %alloc[%124, %125], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%127 = vector.insert_strided_slice %126, %123 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%128 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%129 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%130 = vector.transfer_read %alloc[%128, %129], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%131 = vector.insert_strided_slice %130, %127 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%132 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%133 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%134 = vector.transfer_read %alloc[%132, %133], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%135 = vector.insert_strided_slice %134, %131 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%136 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x] | |
%137 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%138 = vector.transfer_read %alloc[%136, %137], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%139 = vector.insert_strided_slice %138, %135 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%140 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x] | |
%141 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%142 = vector.transfer_read %alloc[%140, %141], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%143 = vector.insert_strided_slice %142, %139 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%144 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x] | |
%145 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%146 = vector.transfer_read %alloc[%144, %145], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%147 = vector.insert_strided_slice %146, %143 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%148 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x] | |
%149 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%150 = vector.transfer_read %alloc[%148, %149], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%151 = vector.insert_strided_slice %150, %147 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%152 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x] | |
%153 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%154 = vector.transfer_read %alloc[%152, %153], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%155 = vector.insert_strided_slice %154, %151 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%156 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x] | |
%157 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%158 = vector.transfer_read %alloc[%156, %157], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%159 = vector.insert_strided_slice %158, %155 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%160 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x] | |
%161 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%162 = vector.transfer_read %alloc[%160, %161], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%163 = vector.insert_strided_slice %162, %159 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%164 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x] | |
%165 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%166 = vector.transfer_read %alloc[%164, %165], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%167 = vector.insert_strided_slice %166, %163 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%168 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x] | |
%169 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%170 = vector.transfer_read %alloc[%168, %169], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%171 = vector.insert_strided_slice %170, %167 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%172 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x] | |
%173 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%174 = vector.transfer_read %alloc[%172, %173], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%175 = vector.insert_strided_slice %174, %171 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%176 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x] | |
%177 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%178 = vector.transfer_read %alloc[%176, %177], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%179 = vector.insert_strided_slice %178, %175 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%180 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x] | |
%181 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%182 = vector.transfer_read %alloc[%180, %181], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%183 = vector.insert_strided_slice %182, %179 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
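// Same accumulation pattern as in the previous dump: four 16x16 subtiles, each
// chaining eight 16x16x16 f16->f32 amdgpu.mfma ops over the K=128 slice.
// Subtile [0, 0]: %119[0, k] x %183[k, 0], k = 0..7.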
%184 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%185 = vector.extract %119[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%186 = vector.extract %183[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%187 = vector.shape_cast %185 : vector<1x1x1x4xf16> to vector<4xf16> | |
%188 = vector.shape_cast %186 : vector<1x1x4x1xf16> to vector<4xf16> | |
%189 = vector.shape_cast %184 : vector<1x1x4x1xf32> to vector<4xf32> | |
%190 = amdgpu.mfma %187 * %188 + %189 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%191 = vector.extract %119[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%192 = vector.extract %183[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%193 = vector.shape_cast %191 : vector<1x1x1x4xf16> to vector<4xf16> | |
%194 = vector.shape_cast %192 : vector<1x1x4x1xf16> to vector<4xf16> | |
%195 = amdgpu.mfma %193 * %194 + %190 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%196 = vector.extract %119[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%197 = vector.extract %183[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%198 = vector.shape_cast %196 : vector<1x1x1x4xf16> to vector<4xf16> | |
%199 = vector.shape_cast %197 : vector<1x1x4x1xf16> to vector<4xf16> | |
%200 = amdgpu.mfma %198 * %199 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%201 = vector.extract %119[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%202 = vector.extract %183[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%203 = vector.shape_cast %201 : vector<1x1x1x4xf16> to vector<4xf16> | |
%204 = vector.shape_cast %202 : vector<1x1x4x1xf16> to vector<4xf16> | |
%205 = amdgpu.mfma %203 * %204 + %200 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%206 = vector.extract %119[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%207 = vector.extract %183[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%208 = vector.shape_cast %206 : vector<1x1x1x4xf16> to vector<4xf16> | |
%209 = vector.shape_cast %207 : vector<1x1x4x1xf16> to vector<4xf16> | |
%210 = amdgpu.mfma %208 * %209 + %205 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%211 = vector.extract %119[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%212 = vector.extract %183[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%213 = vector.shape_cast %211 : vector<1x1x1x4xf16> to vector<4xf16> | |
%214 = vector.shape_cast %212 : vector<1x1x4x1xf16> to vector<4xf16> | |
%215 = amdgpu.mfma %213 * %214 + %210 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%216 = vector.extract %119[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%217 = vector.extract %183[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%218 = vector.shape_cast %216 : vector<1x1x1x4xf16> to vector<4xf16> | |
%219 = vector.shape_cast %217 : vector<1x1x4x1xf16> to vector<4xf16> | |
%220 = amdgpu.mfma %218 * %219 + %215 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%221 = vector.extract %119[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%222 = vector.extract %183[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%223 = vector.shape_cast %221 : vector<1x1x1x4xf16> to vector<4xf16> | |
%224 = vector.shape_cast %222 : vector<1x1x4x1xf16> to vector<4xf16> | |
%225 = amdgpu.mfma %223 * %224 + %220 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x4x1xf32> | |
%227 = vector.insert %226, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
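// Subtile [0, 1]: %119[0, k] x %183[k, 1].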
%228 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%229 = vector.extract %119[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%230 = vector.extract %183[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%231 = vector.shape_cast %229 : vector<1x1x1x4xf16> to vector<4xf16> | |
%232 = vector.shape_cast %230 : vector<1x1x4x1xf16> to vector<4xf16> | |
%233 = vector.shape_cast %228 : vector<1x1x4x1xf32> to vector<4xf32> | |
%234 = amdgpu.mfma %231 * %232 + %233 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%235 = vector.extract %119[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%236 = vector.extract %183[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%237 = vector.shape_cast %235 : vector<1x1x1x4xf16> to vector<4xf16> | |
%238 = vector.shape_cast %236 : vector<1x1x4x1xf16> to vector<4xf16> | |
%239 = amdgpu.mfma %237 * %238 + %234 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%240 = vector.extract %119[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%241 = vector.extract %183[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%242 = vector.shape_cast %240 : vector<1x1x1x4xf16> to vector<4xf16> | |
%243 = vector.shape_cast %241 : vector<1x1x4x1xf16> to vector<4xf16> | |
%244 = amdgpu.mfma %242 * %243 + %239 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%245 = vector.extract %119[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%246 = vector.extract %183[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%247 = vector.shape_cast %245 : vector<1x1x1x4xf16> to vector<4xf16> | |
%248 = vector.shape_cast %246 : vector<1x1x4x1xf16> to vector<4xf16> | |
%249 = amdgpu.mfma %247 * %248 + %244 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%250 = vector.extract %119[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%251 = vector.extract %183[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%252 = vector.shape_cast %250 : vector<1x1x1x4xf16> to vector<4xf16> | |
%253 = vector.shape_cast %251 : vector<1x1x4x1xf16> to vector<4xf16> | |
%254 = amdgpu.mfma %252 * %253 + %249 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%255 = vector.extract %119[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%256 = vector.extract %183[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%257 = vector.shape_cast %255 : vector<1x1x1x4xf16> to vector<4xf16> | |
%258 = vector.shape_cast %256 : vector<1x1x4x1xf16> to vector<4xf16> | |
%259 = amdgpu.mfma %257 * %258 + %254 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%260 = vector.extract %119[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%261 = vector.extract %183[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%262 = vector.shape_cast %260 : vector<1x1x1x4xf16> to vector<4xf16> | |
%263 = vector.shape_cast %261 : vector<1x1x4x1xf16> to vector<4xf16> | |
%264 = amdgpu.mfma %262 * %263 + %259 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%265 = vector.extract %119[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%266 = vector.extract %183[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%267 = vector.shape_cast %265 : vector<1x1x1x4xf16> to vector<4xf16> | |
%268 = vector.shape_cast %266 : vector<1x1x4x1xf16> to vector<4xf16> | |
%269 = amdgpu.mfma %267 * %268 + %264 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%270 = vector.shape_cast %269 : vector<4xf32> to vector<1x1x4x1xf32> | |
%271 = vector.insert %270, %227 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
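// Subtile [1, 0]: %119[1, k] x %183[k, 0].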
%272 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%273 = vector.extract %119[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%274 = vector.extract %183[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%275 = vector.shape_cast %273 : vector<1x1x1x4xf16> to vector<4xf16> | |
%276 = vector.shape_cast %274 : vector<1x1x4x1xf16> to vector<4xf16> | |
%277 = vector.shape_cast %272 : vector<1x1x4x1xf32> to vector<4xf32> | |
%278 = amdgpu.mfma %275 * %276 + %277 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%279 = vector.extract %119[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%280 = vector.extract %183[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%281 = vector.shape_cast %279 : vector<1x1x1x4xf16> to vector<4xf16> | |
%282 = vector.shape_cast %280 : vector<1x1x4x1xf16> to vector<4xf16> | |
%283 = amdgpu.mfma %281 * %282 + %278 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%284 = vector.extract %119[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%285 = vector.extract %183[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%286 = vector.shape_cast %284 : vector<1x1x1x4xf16> to vector<4xf16> | |
%287 = vector.shape_cast %285 : vector<1x1x4x1xf16> to vector<4xf16> | |
%288 = amdgpu.mfma %286 * %287 + %283 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%289 = vector.extract %119[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%290 = vector.extract %183[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%291 = vector.shape_cast %289 : vector<1x1x1x4xf16> to vector<4xf16> | |
%292 = vector.shape_cast %290 : vector<1x1x4x1xf16> to vector<4xf16> | |
%293 = amdgpu.mfma %291 * %292 + %288 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%294 = vector.extract %119[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%295 = vector.extract %183[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%296 = vector.shape_cast %294 : vector<1x1x1x4xf16> to vector<4xf16> | |
%297 = vector.shape_cast %295 : vector<1x1x4x1xf16> to vector<4xf16> | |
%298 = amdgpu.mfma %296 * %297 + %293 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%299 = vector.extract %119[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%300 = vector.extract %183[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%301 = vector.shape_cast %299 : vector<1x1x1x4xf16> to vector<4xf16> | |
%302 = vector.shape_cast %300 : vector<1x1x4x1xf16> to vector<4xf16> | |
%303 = amdgpu.mfma %301 * %302 + %298 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%304 = vector.extract %119[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%305 = vector.extract %183[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%306 = vector.shape_cast %304 : vector<1x1x1x4xf16> to vector<4xf16> | |
%307 = vector.shape_cast %305 : vector<1x1x4x1xf16> to vector<4xf16> | |
%308 = amdgpu.mfma %306 * %307 + %303 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%309 = vector.extract %119[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%310 = vector.extract %183[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%311 = vector.shape_cast %309 : vector<1x1x1x4xf16> to vector<4xf16> | |
%312 = vector.shape_cast %310 : vector<1x1x4x1xf16> to vector<4xf16> | |
%313 = amdgpu.mfma %311 * %312 + %308 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%314 = vector.shape_cast %313 : vector<4xf32> to vector<1x1x4x1xf32> | |
%315 = vector.insert %314, %271 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
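// Note: the [1, 1] fragment below re-materializes operand extracts and shape_casts that already exist for the other fragments (e.g. %317/%319 duplicate %273/%275); the CSE dump further down removes this redundancy.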
%316 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%317 = vector.extract %119[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%318 = vector.extract %183[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%319 = vector.shape_cast %317 : vector<1x1x1x4xf16> to vector<4xf16> | |
%320 = vector.shape_cast %318 : vector<1x1x4x1xf16> to vector<4xf16> | |
%321 = vector.shape_cast %316 : vector<1x1x4x1xf32> to vector<4xf32> | |
%322 = amdgpu.mfma %319 * %320 + %321 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%323 = vector.extract %119[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%324 = vector.extract %183[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%325 = vector.shape_cast %323 : vector<1x1x1x4xf16> to vector<4xf16> | |
%326 = vector.shape_cast %324 : vector<1x1x4x1xf16> to vector<4xf16> | |
%327 = amdgpu.mfma %325 * %326 + %322 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%328 = vector.extract %119[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%329 = vector.extract %183[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%330 = vector.shape_cast %328 : vector<1x1x1x4xf16> to vector<4xf16> | |
%331 = vector.shape_cast %329 : vector<1x1x4x1xf16> to vector<4xf16> | |
%332 = amdgpu.mfma %330 * %331 + %327 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%333 = vector.extract %119[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%334 = vector.extract %183[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%335 = vector.shape_cast %333 : vector<1x1x1x4xf16> to vector<4xf16> | |
%336 = vector.shape_cast %334 : vector<1x1x4x1xf16> to vector<4xf16> | |
%337 = amdgpu.mfma %335 * %336 + %332 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%338 = vector.extract %119[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%339 = vector.extract %183[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%340 = vector.shape_cast %338 : vector<1x1x1x4xf16> to vector<4xf16> | |
%341 = vector.shape_cast %339 : vector<1x1x4x1xf16> to vector<4xf16> | |
%342 = amdgpu.mfma %340 * %341 + %337 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%343 = vector.extract %119[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%344 = vector.extract %183[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%345 = vector.shape_cast %343 : vector<1x1x1x4xf16> to vector<4xf16> | |
%346 = vector.shape_cast %344 : vector<1x1x4x1xf16> to vector<4xf16> | |
%347 = amdgpu.mfma %345 * %346 + %342 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%348 = vector.extract %119[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%349 = vector.extract %183[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%350 = vector.shape_cast %348 : vector<1x1x1x4xf16> to vector<4xf16> | |
%351 = vector.shape_cast %349 : vector<1x1x4x1xf16> to vector<4xf16> | |
%352 = amdgpu.mfma %350 * %351 + %347 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%353 = vector.extract %119[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%354 = vector.extract %183[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%355 = vector.shape_cast %353 : vector<1x1x1x4xf16> to vector<4xf16> | |
%356 = vector.shape_cast %354 : vector<1x1x4x1xf16> to vector<4xf16> | |
%357 = amdgpu.mfma %355 * %356 + %352 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%358 = vector.shape_cast %357 : vector<4xf32> to vector<1x1x4x1xf32> | |
%359 = vector.insert %358, %315 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
scf.yield %359 : vector<2x2x1x1x4x1xf32> | |
} | |
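// Epilogue: each thread extracts its four 4x1 accumulator fragments from the loop result %3 and writes them into the 64x64 output subview at per-thread row/column offsets.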
%4 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%6 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %6, %subview[%4, %5] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%7 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%8 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%9 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %9, %subview[%7, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%10 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%11 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%12 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %12, %subview[%10, %11] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%13 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%14 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%15 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %15, %subview[%13, %14] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
memref.dealloc %alloc_3 : memref<64x128xf16, #gpu.address_space<workgroup>> | |
memref.dealloc %alloc : memref<128x64xf16, #gpu.address_space<workgroup>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
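// Relative to the dump above, CSE deduplicates the repeated affine.apply index computations and the vector.extract/vector.shape_cast operand preparation, so each value is computed once and reused across the four MFMA accumulation chains and the epilogue stores.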
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%thread_id_x = gpu.thread_id x | |
%alloc = memref.alloc() : memref<128x64xf16, #gpu.address_space<workgroup>> | |
%alloc_3 = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) { | |
gpu.barrier | |
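// Global -> workgroup copy: each of the 256 threads loads four vector<1x8xf16> slices of the 64x128 A tile and four of the 128x64 B tile from global memory and stages them in shared memory.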
%12 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x] | |
%13 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x] | |
%14 = vector.transfer_read %0[%12, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x] | |
%16 = vector.transfer_read %0[%15, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x] | |
%18 = vector.transfer_read %0[%17, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x] | |
%20 = vector.transfer_read %0[%19, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x] | |
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x] | |
%23 = vector.transfer_read %1[%21, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x] | |
%25 = vector.transfer_read %1[%24, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x] | |
%27 = vector.transfer_read %1[%26, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x] | |
%29 = vector.transfer_read %1[%28, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x] | |
%31 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x] | |
vector.transfer_write %14, %alloc_3[%30, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%32 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x] | |
vector.transfer_write %16, %alloc_3[%32, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%33 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x] | |
vector.transfer_write %18, %alloc_3[%33, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x] | |
vector.transfer_write %20, %alloc_3[%34, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, #gpu.address_space<workgroup>> | |
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x] | |
%36 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
vector.transfer_write %23, %alloc[%35, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
%37 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x] | |
vector.transfer_write %25, %alloc[%37, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x] | |
vector.transfer_write %27, %alloc[%38, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
%39 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x] | |
vector.transfer_write %29, %alloc[%39, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
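// After the barrier, each thread gathers its A operand as a 2x8 grid of vector<1x4xf16> slices and its B operand as an 8x2 grid of vector<4x1xf16> slices from shared memory.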
%40 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%42 = vector.transfer_read %alloc_3[%40, %41], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%43 = vector.insert_strided_slice %42, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%45 = vector.transfer_read %alloc_3[%40, %44], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%46 = vector.insert_strided_slice %45, %43 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%47 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x] | |
%48 = vector.transfer_read %alloc_3[%40, %47], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%49 = vector.insert_strided_slice %48, %46 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%50 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x] | |
%51 = vector.transfer_read %alloc_3[%40, %50], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%52 = vector.insert_strided_slice %51, %49 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%53 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x] | |
%54 = vector.transfer_read %alloc_3[%40, %53], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%55 = vector.insert_strided_slice %54, %52 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%56 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x] | |
%57 = vector.transfer_read %alloc_3[%40, %56], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%58 = vector.insert_strided_slice %57, %55 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%59 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x] | |
%60 = vector.transfer_read %alloc_3[%40, %59], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%61 = vector.insert_strided_slice %60, %58 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%62 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x] | |
%63 = vector.transfer_read %alloc_3[%40, %62], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%64 = vector.insert_strided_slice %63, %61 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%65 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%66 = vector.transfer_read %alloc_3[%65, %41], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%67 = vector.insert_strided_slice %66, %64 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%68 = vector.transfer_read %alloc_3[%65, %44], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%69 = vector.insert_strided_slice %68, %67 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%70 = vector.transfer_read %alloc_3[%65, %47], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%71 = vector.insert_strided_slice %70, %69 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%72 = vector.transfer_read %alloc_3[%65, %50], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%73 = vector.insert_strided_slice %72, %71 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%74 = vector.transfer_read %alloc_3[%65, %53], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%75 = vector.insert_strided_slice %74, %73 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%76 = vector.transfer_read %alloc_3[%65, %56], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%77 = vector.insert_strided_slice %76, %75 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%78 = vector.transfer_read %alloc_3[%65, %59], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%79 = vector.insert_strided_slice %78, %77 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%80 = vector.transfer_read %alloc_3[%65, %62], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%81 = vector.insert_strided_slice %80, %79 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%82 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%83 = vector.transfer_read %alloc[%41, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%84 = vector.insert_strided_slice %83, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%85 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%86 = vector.transfer_read %alloc[%41, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%87 = vector.insert_strided_slice %86, %84 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%88 = vector.transfer_read %alloc[%44, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%89 = vector.insert_strided_slice %88, %87 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%90 = vector.transfer_read %alloc[%44, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%91 = vector.insert_strided_slice %90, %89 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%92 = vector.transfer_read %alloc[%47, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%93 = vector.insert_strided_slice %92, %91 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%94 = vector.transfer_read %alloc[%47, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%95 = vector.insert_strided_slice %94, %93 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%96 = vector.transfer_read %alloc[%50, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%97 = vector.insert_strided_slice %96, %95 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%98 = vector.transfer_read %alloc[%50, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%99 = vector.insert_strided_slice %98, %97 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%100 = vector.transfer_read %alloc[%53, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%101 = vector.insert_strided_slice %100, %99 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%102 = vector.transfer_read %alloc[%53, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%103 = vector.insert_strided_slice %102, %101 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%104 = vector.transfer_read %alloc[%56, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%105 = vector.insert_strided_slice %104, %103 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%106 = vector.transfer_read %alloc[%56, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%107 = vector.insert_strided_slice %106, %105 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%108 = vector.transfer_read %alloc[%59, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%109 = vector.insert_strided_slice %108, %107 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%110 = vector.transfer_read %alloc[%59, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%111 = vector.insert_strided_slice %110, %109 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%112 = vector.transfer_read %alloc[%62, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%113 = vector.insert_strided_slice %112, %111 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%114 = vector.transfer_read %alloc[%62, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%115 = vector.insert_strided_slice %114, %113 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
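// MFMA accumulation: for each of the 2x2 accumulator fragments, eight chained amdgpu.mfma (16x16x16, f16 -> f32) ops walk the unrolled K dimension (8 x 16 = 128 elements per scf.for step).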
%116 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%117 = vector.extract %81[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%118 = vector.extract %115[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%119 = vector.shape_cast %117 : vector<1x1x1x4xf16> to vector<4xf16> | |
%120 = vector.shape_cast %118 : vector<1x1x4x1xf16> to vector<4xf16> | |
%121 = vector.shape_cast %116 : vector<1x1x4x1xf32> to vector<4xf32> | |
%122 = amdgpu.mfma %119 * %120 + %121 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%123 = vector.extract %81[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%124 = vector.extract %115[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%125 = vector.shape_cast %123 : vector<1x1x1x4xf16> to vector<4xf16> | |
%126 = vector.shape_cast %124 : vector<1x1x4x1xf16> to vector<4xf16> | |
%127 = amdgpu.mfma %125 * %126 + %122 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%128 = vector.extract %81[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%129 = vector.extract %115[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%130 = vector.shape_cast %128 : vector<1x1x1x4xf16> to vector<4xf16> | |
%131 = vector.shape_cast %129 : vector<1x1x4x1xf16> to vector<4xf16> | |
%132 = amdgpu.mfma %130 * %131 + %127 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%133 = vector.extract %81[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%134 = vector.extract %115[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%135 = vector.shape_cast %133 : vector<1x1x1x4xf16> to vector<4xf16> | |
%136 = vector.shape_cast %134 : vector<1x1x4x1xf16> to vector<4xf16> | |
%137 = amdgpu.mfma %135 * %136 + %132 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%138 = vector.extract %81[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%139 = vector.extract %115[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%140 = vector.shape_cast %138 : vector<1x1x1x4xf16> to vector<4xf16> | |
%141 = vector.shape_cast %139 : vector<1x1x4x1xf16> to vector<4xf16> | |
%142 = amdgpu.mfma %140 * %141 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%143 = vector.extract %81[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%144 = vector.extract %115[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%145 = vector.shape_cast %143 : vector<1x1x1x4xf16> to vector<4xf16> | |
%146 = vector.shape_cast %144 : vector<1x1x4x1xf16> to vector<4xf16> | |
%147 = amdgpu.mfma %145 * %146 + %142 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%148 = vector.extract %81[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%149 = vector.extract %115[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%150 = vector.shape_cast %148 : vector<1x1x1x4xf16> to vector<4xf16> | |
%151 = vector.shape_cast %149 : vector<1x1x4x1xf16> to vector<4xf16> | |
%152 = amdgpu.mfma %150 * %151 + %147 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%153 = vector.extract %81[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%154 = vector.extract %115[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%155 = vector.shape_cast %153 : vector<1x1x1x4xf16> to vector<4xf16> | |
%156 = vector.shape_cast %154 : vector<1x1x4x1xf16> to vector<4xf16> | |
%157 = amdgpu.mfma %155 * %156 + %152 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%158 = vector.shape_cast %157 : vector<4xf32> to vector<1x1x4x1xf32> | |
%159 = vector.insert %158, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
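// The other three fragments below reuse these shape_cast results: the A casts (%119, %125, ...) are shared across the two N fragments and the B casts (%120, %126, ...) across the two M fragments.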
%160 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%161 = vector.extract %115[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%162 = vector.shape_cast %161 : vector<1x1x4x1xf16> to vector<4xf16> | |
%163 = vector.shape_cast %160 : vector<1x1x4x1xf32> to vector<4xf32> | |
%164 = amdgpu.mfma %119 * %162 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%165 = vector.extract %115[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%166 = vector.shape_cast %165 : vector<1x1x4x1xf16> to vector<4xf16> | |
%167 = amdgpu.mfma %125 * %166 + %164 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%168 = vector.extract %115[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%169 = vector.shape_cast %168 : vector<1x1x4x1xf16> to vector<4xf16> | |
%170 = amdgpu.mfma %130 * %169 + %167 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%171 = vector.extract %115[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%172 = vector.shape_cast %171 : vector<1x1x4x1xf16> to vector<4xf16> | |
%173 = amdgpu.mfma %135 * %172 + %170 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%174 = vector.extract %115[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%175 = vector.shape_cast %174 : vector<1x1x4x1xf16> to vector<4xf16> | |
%176 = amdgpu.mfma %140 * %175 + %173 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%177 = vector.extract %115[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%178 = vector.shape_cast %177 : vector<1x1x4x1xf16> to vector<4xf16> | |
%179 = amdgpu.mfma %145 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%180 = vector.extract %115[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%181 = vector.shape_cast %180 : vector<1x1x4x1xf16> to vector<4xf16> | |
%182 = amdgpu.mfma %150 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%183 = vector.extract %115[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%184 = vector.shape_cast %183 : vector<1x1x4x1xf16> to vector<4xf16> | |
%185 = amdgpu.mfma %155 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x4x1xf32> | |
%187 = vector.insert %186, %159 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%188 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%189 = vector.extract %81[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%190 = vector.shape_cast %189 : vector<1x1x1x4xf16> to vector<4xf16> | |
%191 = vector.shape_cast %188 : vector<1x1x4x1xf32> to vector<4xf32> | |
%192 = amdgpu.mfma %190 * %120 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%193 = vector.extract %81[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%194 = vector.shape_cast %193 : vector<1x1x1x4xf16> to vector<4xf16> | |
%195 = amdgpu.mfma %194 * %126 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%196 = vector.extract %81[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%197 = vector.shape_cast %196 : vector<1x1x1x4xf16> to vector<4xf16> | |
%198 = amdgpu.mfma %197 * %131 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%199 = vector.extract %81[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%200 = vector.shape_cast %199 : vector<1x1x1x4xf16> to vector<4xf16> | |
%201 = amdgpu.mfma %200 * %136 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%202 = vector.extract %81[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%203 = vector.shape_cast %202 : vector<1x1x1x4xf16> to vector<4xf16> | |
%204 = amdgpu.mfma %203 * %141 + %201 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%205 = vector.extract %81[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%206 = vector.shape_cast %205 : vector<1x1x1x4xf16> to vector<4xf16> | |
%207 = amdgpu.mfma %206 * %146 + %204 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%208 = vector.extract %81[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%209 = vector.shape_cast %208 : vector<1x1x1x4xf16> to vector<4xf16> | |
%210 = amdgpu.mfma %209 * %151 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%211 = vector.extract %81[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%212 = vector.shape_cast %211 : vector<1x1x1x4xf16> to vector<4xf16> | |
%213 = amdgpu.mfma %212 * %156 + %210 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%214 = vector.shape_cast %213 : vector<4xf32> to vector<1x1x4x1xf32> | |
%215 = vector.insert %214, %187 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%216 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%217 = vector.shape_cast %216 : vector<1x1x4x1xf32> to vector<4xf32> | |
%218 = amdgpu.mfma %190 * %162 + %217 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%219 = amdgpu.mfma %194 * %166 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%220 = amdgpu.mfma %197 * %169 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%221 = amdgpu.mfma %200 * %172 + %220 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%222 = amdgpu.mfma %203 * %175 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%223 = amdgpu.mfma %206 * %178 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%224 = amdgpu.mfma %209 * %181 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%225 = amdgpu.mfma %212 * %184 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x4x1xf32> | |
%227 = vector.insert %226, %215 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
scf.yield %227 : vector<2x2x1x1x4x1xf32> | |
} | |
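// Epilogue after CSE: the row/column offsets are computed once (%4, %5, %7, %9) and reused by the four vector.transfer_write ops that store the accumulator fragments.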
%4 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%6 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %6, %subview[%4, %5] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%7 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%8 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %8, %subview[%4, %7] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%9 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%10 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %10, %subview[%9, %5] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %11, %subview[%9, %7] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
memref.dealloc %alloc_3 : memref<64x128xf16, #gpu.address_space<workgroup>> | |
memref.dealloc %alloc : memref<128x64xf16, #gpu.address_space<workgroup>> | |
return | |
} | |
// -----// IR Dump After GPUReduceBankConflictsPass (iree-codegen-gpu-reduce-bank-conflicts) //----- // | |
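// This pass pads each workgroup buffer by four f16 columns (64x128 -> 64x132, 128x64 -> 128x68) and rewrites all shared-memory accesses to go through subviews of the original shape, intended to break the power-of-two row stride that causes LDS bank conflicts.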
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%thread_id_x = gpu.thread_id x | |
%alloc = memref.alloc() : memref<128x68xf16, #gpu.address_space<workgroup>> | |
%subview = memref.subview %alloc[0, 0] [128, 64] [1, 1] : memref<128x68xf16, #gpu.address_space<workgroup>> to memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
%alloc_3 = memref.alloc() : memref<64x132xf16, #gpu.address_space<workgroup>> | |
%subview_4 = memref.subview %alloc_3[0, 0] [64, 128] [1, 1] : memref<64x132xf16, #gpu.address_space<workgroup>> to memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
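// Indexing in the loop body is unchanged: the subviews expose the original 64x128 / 128x64 shapes, while the padded parent allocations give rows a stride of 132 / 68 elements.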
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%subview_5 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) { | |
gpu.barrier | |
%12 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x] | |
%13 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x] | |
%14 = vector.transfer_read %0[%12, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x] | |
%16 = vector.transfer_read %0[%15, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x] | |
%18 = vector.transfer_read %0[%17, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x] | |
%20 = vector.transfer_read %0[%19, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x] | |
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x] | |
%23 = vector.transfer_read %1[%21, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x] | |
%25 = vector.transfer_read %1[%24, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x] | |
%27 = vector.transfer_read %1[%26, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x] | |
%29 = vector.transfer_read %1[%28, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x] | |
%31 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x] | |
vector.transfer_write %14, %subview_4[%30, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
%32 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x] | |
vector.transfer_write %16, %subview_4[%32, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
%33 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x] | |
vector.transfer_write %18, %subview_4[%33, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x] | |
vector.transfer_write %20, %subview_4[%34, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x] | |
%36 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
vector.transfer_write %23, %subview[%35, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
%37 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x] | |
vector.transfer_write %25, %subview[%37, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x] | |
vector.transfer_write %27, %subview[%38, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
%39 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x] | |
vector.transfer_write %29, %subview[%39, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
gpu.barrier | |
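// Same shared-memory read pattern as before, now through the strided subviews (strided<[132, 1]> for A, strided<[68, 1]> for B); the operand fragments and the MFMA chain appear otherwise unchanged from the CSE dump.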
%40 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%42 = vector.transfer_read %subview_4[%40, %41], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%43 = vector.insert_strided_slice %42, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%45 = vector.transfer_read %subview_4[%40, %44], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%46 = vector.insert_strided_slice %45, %43 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%47 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x] | |
%48 = vector.transfer_read %subview_4[%40, %47], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%49 = vector.insert_strided_slice %48, %46 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%50 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x] | |
%51 = vector.transfer_read %subview_4[%40, %50], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%52 = vector.insert_strided_slice %51, %49 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%53 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x] | |
%54 = vector.transfer_read %subview_4[%40, %53], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%55 = vector.insert_strided_slice %54, %52 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%56 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x] | |
%57 = vector.transfer_read %subview_4[%40, %56], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%58 = vector.insert_strided_slice %57, %55 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%59 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x] | |
%60 = vector.transfer_read %subview_4[%40, %59], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%61 = vector.insert_strided_slice %60, %58 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%62 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x] | |
%63 = vector.transfer_read %subview_4[%40, %62], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%64 = vector.insert_strided_slice %63, %61 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%65 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%66 = vector.transfer_read %subview_4[%65, %41], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%67 = vector.insert_strided_slice %66, %64 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%68 = vector.transfer_read %subview_4[%65, %44], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%69 = vector.insert_strided_slice %68, %67 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%70 = vector.transfer_read %subview_4[%65, %47], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%71 = vector.insert_strided_slice %70, %69 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%72 = vector.transfer_read %subview_4[%65, %50], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%73 = vector.insert_strided_slice %72, %71 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%74 = vector.transfer_read %subview_4[%65, %53], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%75 = vector.insert_strided_slice %74, %73 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%76 = vector.transfer_read %subview_4[%65, %56], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%77 = vector.insert_strided_slice %76, %75 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%78 = vector.transfer_read %subview_4[%65, %59], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%79 = vector.insert_strided_slice %78, %77 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%80 = vector.transfer_read %subview_4[%65, %62], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%81 = vector.insert_strided_slice %80, %79 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%82 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%83 = vector.transfer_read %subview[%41, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%84 = vector.insert_strided_slice %83, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%85 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%86 = vector.transfer_read %subview[%41, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%87 = vector.insert_strided_slice %86, %84 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%88 = vector.transfer_read %subview[%44, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%89 = vector.insert_strided_slice %88, %87 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%90 = vector.transfer_read %subview[%44, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%91 = vector.insert_strided_slice %90, %89 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%92 = vector.transfer_read %subview[%47, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%93 = vector.insert_strided_slice %92, %91 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%94 = vector.transfer_read %subview[%47, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%95 = vector.insert_strided_slice %94, %93 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%96 = vector.transfer_read %subview[%50, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%97 = vector.insert_strided_slice %96, %95 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%98 = vector.transfer_read %subview[%50, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%99 = vector.insert_strided_slice %98, %97 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%100 = vector.transfer_read %subview[%53, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%101 = vector.insert_strided_slice %100, %99 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%102 = vector.transfer_read %subview[%53, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%103 = vector.insert_strided_slice %102, %101 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%104 = vector.transfer_read %subview[%56, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%105 = vector.insert_strided_slice %104, %103 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%106 = vector.transfer_read %subview[%56, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%107 = vector.insert_strided_slice %106, %105 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%108 = vector.transfer_read %subview[%59, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%109 = vector.insert_strided_slice %108, %107 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%110 = vector.transfer_read %subview[%59, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%111 = vector.insert_strided_slice %110, %109 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%112 = vector.transfer_read %subview[%62, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%113 = vector.insert_strided_slice %112, %111 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%114 = vector.transfer_read %subview[%62, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%115 = vector.insert_strided_slice %114, %113 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
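// [reader annotation, not compiler output] %81 and %115 now hold the A and B register fragments
// for this K slice. The block below updates the 2x2 accumulator: for each (m, n) sub-tile, eight
// chained amdgpu.mfma ops (16x16x16, f16 inputs, f32 accumulator) walk the unrolled K = 128
// reduction in steps of 16, giving 32 mfma ops per loop iteration in total.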
%116 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%117 = vector.extract %81[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%118 = vector.extract %115[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%119 = vector.shape_cast %117 : vector<1x1x1x4xf16> to vector<4xf16> | |
%120 = vector.shape_cast %118 : vector<1x1x4x1xf16> to vector<4xf16> | |
%121 = vector.shape_cast %116 : vector<1x1x4x1xf32> to vector<4xf32> | |
%122 = amdgpu.mfma %119 * %120 + %121 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%123 = vector.extract %81[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%124 = vector.extract %115[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%125 = vector.shape_cast %123 : vector<1x1x1x4xf16> to vector<4xf16> | |
%126 = vector.shape_cast %124 : vector<1x1x4x1xf16> to vector<4xf16> | |
%127 = amdgpu.mfma %125 * %126 + %122 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%128 = vector.extract %81[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%129 = vector.extract %115[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%130 = vector.shape_cast %128 : vector<1x1x1x4xf16> to vector<4xf16> | |
%131 = vector.shape_cast %129 : vector<1x1x4x1xf16> to vector<4xf16> | |
%132 = amdgpu.mfma %130 * %131 + %127 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%133 = vector.extract %81[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%134 = vector.extract %115[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%135 = vector.shape_cast %133 : vector<1x1x1x4xf16> to vector<4xf16> | |
%136 = vector.shape_cast %134 : vector<1x1x4x1xf16> to vector<4xf16> | |
%137 = amdgpu.mfma %135 * %136 + %132 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%138 = vector.extract %81[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%139 = vector.extract %115[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%140 = vector.shape_cast %138 : vector<1x1x1x4xf16> to vector<4xf16> | |
%141 = vector.shape_cast %139 : vector<1x1x4x1xf16> to vector<4xf16> | |
%142 = amdgpu.mfma %140 * %141 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%143 = vector.extract %81[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%144 = vector.extract %115[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%145 = vector.shape_cast %143 : vector<1x1x1x4xf16> to vector<4xf16> | |
%146 = vector.shape_cast %144 : vector<1x1x4x1xf16> to vector<4xf16> | |
%147 = amdgpu.mfma %145 * %146 + %142 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%148 = vector.extract %81[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%149 = vector.extract %115[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%150 = vector.shape_cast %148 : vector<1x1x1x4xf16> to vector<4xf16> | |
%151 = vector.shape_cast %149 : vector<1x1x4x1xf16> to vector<4xf16> | |
%152 = amdgpu.mfma %150 * %151 + %147 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%153 = vector.extract %81[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%154 = vector.extract %115[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%155 = vector.shape_cast %153 : vector<1x1x1x4xf16> to vector<4xf16> | |
%156 = vector.shape_cast %154 : vector<1x1x4x1xf16> to vector<4xf16> | |
%157 = amdgpu.mfma %155 * %156 + %152 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%158 = vector.shape_cast %157 : vector<4xf32> to vector<1x1x4x1xf32> | |
%159 = vector.insert %158, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%160 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%161 = vector.extract %115[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%162 = vector.shape_cast %161 : vector<1x1x4x1xf16> to vector<4xf16> | |
%163 = vector.shape_cast %160 : vector<1x1x4x1xf32> to vector<4xf32> | |
%164 = amdgpu.mfma %119 * %162 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%165 = vector.extract %115[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%166 = vector.shape_cast %165 : vector<1x1x4x1xf16> to vector<4xf16> | |
%167 = amdgpu.mfma %125 * %166 + %164 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%168 = vector.extract %115[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%169 = vector.shape_cast %168 : vector<1x1x4x1xf16> to vector<4xf16> | |
%170 = amdgpu.mfma %130 * %169 + %167 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%171 = vector.extract %115[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%172 = vector.shape_cast %171 : vector<1x1x4x1xf16> to vector<4xf16> | |
%173 = amdgpu.mfma %135 * %172 + %170 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%174 = vector.extract %115[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%175 = vector.shape_cast %174 : vector<1x1x4x1xf16> to vector<4xf16> | |
%176 = amdgpu.mfma %140 * %175 + %173 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%177 = vector.extract %115[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%178 = vector.shape_cast %177 : vector<1x1x4x1xf16> to vector<4xf16> | |
%179 = amdgpu.mfma %145 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%180 = vector.extract %115[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%181 = vector.shape_cast %180 : vector<1x1x4x1xf16> to vector<4xf16> | |
%182 = amdgpu.mfma %150 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%183 = vector.extract %115[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%184 = vector.shape_cast %183 : vector<1x1x4x1xf16> to vector<4xf16> | |
%185 = amdgpu.mfma %155 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x4x1xf32> | |
%187 = vector.insert %186, %159 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%188 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%189 = vector.extract %81[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%190 = vector.shape_cast %189 : vector<1x1x1x4xf16> to vector<4xf16> | |
%191 = vector.shape_cast %188 : vector<1x1x4x1xf32> to vector<4xf32> | |
%192 = amdgpu.mfma %190 * %120 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%193 = vector.extract %81[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%194 = vector.shape_cast %193 : vector<1x1x1x4xf16> to vector<4xf16> | |
%195 = amdgpu.mfma %194 * %126 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%196 = vector.extract %81[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%197 = vector.shape_cast %196 : vector<1x1x1x4xf16> to vector<4xf16> | |
%198 = amdgpu.mfma %197 * %131 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%199 = vector.extract %81[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%200 = vector.shape_cast %199 : vector<1x1x1x4xf16> to vector<4xf16> | |
%201 = amdgpu.mfma %200 * %136 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%202 = vector.extract %81[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%203 = vector.shape_cast %202 : vector<1x1x1x4xf16> to vector<4xf16> | |
%204 = amdgpu.mfma %203 * %141 + %201 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%205 = vector.extract %81[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%206 = vector.shape_cast %205 : vector<1x1x1x4xf16> to vector<4xf16> | |
%207 = amdgpu.mfma %206 * %146 + %204 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%208 = vector.extract %81[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%209 = vector.shape_cast %208 : vector<1x1x1x4xf16> to vector<4xf16> | |
%210 = amdgpu.mfma %209 * %151 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%211 = vector.extract %81[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%212 = vector.shape_cast %211 : vector<1x1x1x4xf16> to vector<4xf16> | |
%213 = amdgpu.mfma %212 * %156 + %210 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%214 = vector.shape_cast %213 : vector<4xf32> to vector<1x1x4x1xf32> | |
%215 = vector.insert %214, %187 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%216 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%217 = vector.shape_cast %216 : vector<1x1x4x1xf32> to vector<4xf32> | |
%218 = amdgpu.mfma %190 * %162 + %217 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%219 = amdgpu.mfma %194 * %166 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%220 = amdgpu.mfma %197 * %169 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%221 = amdgpu.mfma %200 * %172 + %220 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%222 = amdgpu.mfma %203 * %175 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%223 = amdgpu.mfma %206 * %178 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%224 = amdgpu.mfma %209 * %181 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%225 = amdgpu.mfma %212 * %184 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x4x1xf32> | |
%227 = vector.insert %226, %215 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
scf.yield %227 : vector<2x2x1x1x4x1xf32> | |
} | |
%4 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
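// [reader annotation, not compiler output] %4 simplifies to 32 * (tid floordiv 128) +
// 4 * ((tid floordiv 16) mod 4) (the starting C row for this lane) and %5 to
// (tid mod 16) + 32 * ((tid floordiv 64) mod 2) (the C column); each lane then writes four
// vector<4x1xf32> fragments of the 64x64 output tile, with %7 and %9 adding the +16 offsets.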
%6 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %6, %subview_5[%4, %5] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%7 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%8 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %8, %subview_5[%4, %7] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%9 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%10 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %10, %subview_5[%9, %5] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %11, %subview_5[%9, %7] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
memref.dealloc %subview_4 : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
memref.dealloc %subview : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
return | |
} | |
// -----// IR Dump After LLVMGPUPrefetchSharedMemoryPass (iree-llvmgpu-prefetch-shared-memory) //----- // | |
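// [reader annotation, not compiler output] The portion of the previous dump visible above matches
// this dump op-for-op, so LLVMGPUPrefetchSharedMemoryPass appears to have left this function
// unchanged here (no software-pipelined prefetch structure was introduced), even though the
// translation_info sets prefetch_shared_memory = true.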
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%thread_id_x = gpu.thread_id x | |
%alloc = memref.alloc() : memref<128x68xf16, #gpu.address_space<workgroup>> | |
%subview = memref.subview %alloc[0, 0] [128, 64] [1, 1] : memref<128x68xf16, #gpu.address_space<workgroup>> to memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
%alloc_3 = memref.alloc() : memref<64x132xf16, #gpu.address_space<workgroup>> | |
%subview_4 = memref.subview %alloc_3[0, 0] [64, 128] [1, 1] : memref<64x132xf16, #gpu.address_space<workgroup>> to memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%subview_5 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) { | |
gpu.barrier | |
%12 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x] | |
%13 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x] | |
%14 = vector.transfer_read %0[%12, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x] | |
%16 = vector.transfer_read %0[%15, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x] | |
%18 = vector.transfer_read %0[%17, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x] | |
%20 = vector.transfer_read %0[%19, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x] | |
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x] | |
%23 = vector.transfer_read %1[%21, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x] | |
%25 = vector.transfer_read %1[%24, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x] | |
%27 = vector.transfer_read %1[%26, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x] | |
%29 = vector.transfer_read %1[%28, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x] | |
%31 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x] | |
vector.transfer_write %14, %subview_4[%30, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
%32 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x] | |
vector.transfer_write %16, %subview_4[%32, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
%33 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x] | |
vector.transfer_write %18, %subview_4[%33, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x] | |
vector.transfer_write %20, %subview_4[%34, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x] | |
%36 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
vector.transfer_write %23, %subview[%35, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
%37 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x] | |
vector.transfer_write %25, %subview[%37, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x] | |
vector.transfer_write %27, %subview[%38, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
%39 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x] | |
vector.transfer_write %29, %subview[%39, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%40 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%42 = vector.transfer_read %subview_4[%40, %41], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%43 = vector.insert_strided_slice %42, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%45 = vector.transfer_read %subview_4[%40, %44], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%46 = vector.insert_strided_slice %45, %43 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%47 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x] | |
%48 = vector.transfer_read %subview_4[%40, %47], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%49 = vector.insert_strided_slice %48, %46 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%50 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x] | |
%51 = vector.transfer_read %subview_4[%40, %50], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%52 = vector.insert_strided_slice %51, %49 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%53 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x] | |
%54 = vector.transfer_read %subview_4[%40, %53], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%55 = vector.insert_strided_slice %54, %52 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%56 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x] | |
%57 = vector.transfer_read %subview_4[%40, %56], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%58 = vector.insert_strided_slice %57, %55 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%59 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x] | |
%60 = vector.transfer_read %subview_4[%40, %59], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%61 = vector.insert_strided_slice %60, %58 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%62 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x] | |
%63 = vector.transfer_read %subview_4[%40, %62], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%64 = vector.insert_strided_slice %63, %61 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%65 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%66 = vector.transfer_read %subview_4[%65, %41], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%67 = vector.insert_strided_slice %66, %64 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%68 = vector.transfer_read %subview_4[%65, %44], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%69 = vector.insert_strided_slice %68, %67 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%70 = vector.transfer_read %subview_4[%65, %47], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%71 = vector.insert_strided_slice %70, %69 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%72 = vector.transfer_read %subview_4[%65, %50], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%73 = vector.insert_strided_slice %72, %71 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%74 = vector.transfer_read %subview_4[%65, %53], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%75 = vector.insert_strided_slice %74, %73 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%76 = vector.transfer_read %subview_4[%65, %56], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%77 = vector.insert_strided_slice %76, %75 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%78 = vector.transfer_read %subview_4[%65, %59], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%79 = vector.insert_strided_slice %78, %77 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%80 = vector.transfer_read %subview_4[%65, %62], %cst_2 {in_bounds = [true, true]} : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%81 = vector.insert_strided_slice %80, %79 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%82 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%83 = vector.transfer_read %subview[%41, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%84 = vector.insert_strided_slice %83, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%85 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%86 = vector.transfer_read %subview[%41, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%87 = vector.insert_strided_slice %86, %84 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%88 = vector.transfer_read %subview[%44, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%89 = vector.insert_strided_slice %88, %87 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%90 = vector.transfer_read %subview[%44, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%91 = vector.insert_strided_slice %90, %89 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%92 = vector.transfer_read %subview[%47, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%93 = vector.insert_strided_slice %92, %91 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%94 = vector.transfer_read %subview[%47, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%95 = vector.insert_strided_slice %94, %93 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%96 = vector.transfer_read %subview[%50, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%97 = vector.insert_strided_slice %96, %95 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%98 = vector.transfer_read %subview[%50, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%99 = vector.insert_strided_slice %98, %97 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%100 = vector.transfer_read %subview[%53, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%101 = vector.insert_strided_slice %100, %99 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%102 = vector.transfer_read %subview[%53, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%103 = vector.insert_strided_slice %102, %101 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%104 = vector.transfer_read %subview[%56, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%105 = vector.insert_strided_slice %104, %103 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%106 = vector.transfer_read %subview[%56, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%107 = vector.insert_strided_slice %106, %105 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%108 = vector.transfer_read %subview[%59, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%109 = vector.insert_strided_slice %108, %107 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%110 = vector.transfer_read %subview[%59, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%111 = vector.insert_strided_slice %110, %109 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%112 = vector.transfer_read %subview[%62, %82], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%113 = vector.insert_strided_slice %112, %111 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%114 = vector.transfer_read %subview[%62, %85], %cst_2 {in_bounds = [true, true]} : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%115 = vector.insert_strided_slice %114, %113 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%116 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%117 = vector.extract %81[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%118 = vector.extract %115[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%119 = vector.shape_cast %117 : vector<1x1x1x4xf16> to vector<4xf16> | |
%120 = vector.shape_cast %118 : vector<1x1x4x1xf16> to vector<4xf16> | |
%121 = vector.shape_cast %116 : vector<1x1x4x1xf32> to vector<4xf32> | |
%122 = amdgpu.mfma %119 * %120 + %121 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%123 = vector.extract %81[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%124 = vector.extract %115[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%125 = vector.shape_cast %123 : vector<1x1x1x4xf16> to vector<4xf16> | |
%126 = vector.shape_cast %124 : vector<1x1x4x1xf16> to vector<4xf16> | |
%127 = amdgpu.mfma %125 * %126 + %122 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%128 = vector.extract %81[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%129 = vector.extract %115[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%130 = vector.shape_cast %128 : vector<1x1x1x4xf16> to vector<4xf16> | |
%131 = vector.shape_cast %129 : vector<1x1x4x1xf16> to vector<4xf16> | |
%132 = amdgpu.mfma %130 * %131 + %127 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%133 = vector.extract %81[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%134 = vector.extract %115[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%135 = vector.shape_cast %133 : vector<1x1x1x4xf16> to vector<4xf16> | |
%136 = vector.shape_cast %134 : vector<1x1x4x1xf16> to vector<4xf16> | |
%137 = amdgpu.mfma %135 * %136 + %132 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%138 = vector.extract %81[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%139 = vector.extract %115[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%140 = vector.shape_cast %138 : vector<1x1x1x4xf16> to vector<4xf16> | |
%141 = vector.shape_cast %139 : vector<1x1x4x1xf16> to vector<4xf16> | |
%142 = amdgpu.mfma %140 * %141 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%143 = vector.extract %81[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%144 = vector.extract %115[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%145 = vector.shape_cast %143 : vector<1x1x1x4xf16> to vector<4xf16> | |
%146 = vector.shape_cast %144 : vector<1x1x4x1xf16> to vector<4xf16> | |
%147 = amdgpu.mfma %145 * %146 + %142 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%148 = vector.extract %81[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%149 = vector.extract %115[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%150 = vector.shape_cast %148 : vector<1x1x1x4xf16> to vector<4xf16> | |
%151 = vector.shape_cast %149 : vector<1x1x4x1xf16> to vector<4xf16> | |
%152 = amdgpu.mfma %150 * %151 + %147 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%153 = vector.extract %81[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%154 = vector.extract %115[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%155 = vector.shape_cast %153 : vector<1x1x1x4xf16> to vector<4xf16> | |
%156 = vector.shape_cast %154 : vector<1x1x4x1xf16> to vector<4xf16> | |
%157 = amdgpu.mfma %155 * %156 + %152 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%158 = vector.shape_cast %157 : vector<4xf32> to vector<1x1x4x1xf32> | |
%159 = vector.insert %158, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%160 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%161 = vector.extract %115[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%162 = vector.shape_cast %161 : vector<1x1x4x1xf16> to vector<4xf16> | |
%163 = vector.shape_cast %160 : vector<1x1x4x1xf32> to vector<4xf32> | |
%164 = amdgpu.mfma %119 * %162 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%165 = vector.extract %115[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%166 = vector.shape_cast %165 : vector<1x1x4x1xf16> to vector<4xf16> | |
%167 = amdgpu.mfma %125 * %166 + %164 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%168 = vector.extract %115[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%169 = vector.shape_cast %168 : vector<1x1x4x1xf16> to vector<4xf16> | |
%170 = amdgpu.mfma %130 * %169 + %167 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%171 = vector.extract %115[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%172 = vector.shape_cast %171 : vector<1x1x4x1xf16> to vector<4xf16> | |
%173 = amdgpu.mfma %135 * %172 + %170 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%174 = vector.extract %115[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%175 = vector.shape_cast %174 : vector<1x1x4x1xf16> to vector<4xf16> | |
%176 = amdgpu.mfma %140 * %175 + %173 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%177 = vector.extract %115[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%178 = vector.shape_cast %177 : vector<1x1x4x1xf16> to vector<4xf16> | |
%179 = amdgpu.mfma %145 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%180 = vector.extract %115[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%181 = vector.shape_cast %180 : vector<1x1x4x1xf16> to vector<4xf16> | |
%182 = amdgpu.mfma %150 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%183 = vector.extract %115[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%184 = vector.shape_cast %183 : vector<1x1x4x1xf16> to vector<4xf16> | |
%185 = amdgpu.mfma %155 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x4x1xf32> | |
%187 = vector.insert %186, %159 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%188 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%189 = vector.extract %81[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%190 = vector.shape_cast %189 : vector<1x1x1x4xf16> to vector<4xf16> | |
%191 = vector.shape_cast %188 : vector<1x1x4x1xf32> to vector<4xf32> | |
%192 = amdgpu.mfma %190 * %120 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%193 = vector.extract %81[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%194 = vector.shape_cast %193 : vector<1x1x1x4xf16> to vector<4xf16> | |
%195 = amdgpu.mfma %194 * %126 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%196 = vector.extract %81[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%197 = vector.shape_cast %196 : vector<1x1x1x4xf16> to vector<4xf16> | |
%198 = amdgpu.mfma %197 * %131 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%199 = vector.extract %81[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%200 = vector.shape_cast %199 : vector<1x1x1x4xf16> to vector<4xf16> | |
%201 = amdgpu.mfma %200 * %136 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%202 = vector.extract %81[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%203 = vector.shape_cast %202 : vector<1x1x1x4xf16> to vector<4xf16> | |
%204 = amdgpu.mfma %203 * %141 + %201 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%205 = vector.extract %81[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%206 = vector.shape_cast %205 : vector<1x1x1x4xf16> to vector<4xf16> | |
%207 = amdgpu.mfma %206 * %146 + %204 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%208 = vector.extract %81[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%209 = vector.shape_cast %208 : vector<1x1x1x4xf16> to vector<4xf16> | |
%210 = amdgpu.mfma %209 * %151 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%211 = vector.extract %81[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%212 = vector.shape_cast %211 : vector<1x1x1x4xf16> to vector<4xf16> | |
%213 = amdgpu.mfma %212 * %156 + %210 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%214 = vector.shape_cast %213 : vector<4xf32> to vector<1x1x4x1xf32> | |
%215 = vector.insert %214, %187 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%216 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%217 = vector.shape_cast %216 : vector<1x1x4x1xf32> to vector<4xf32> | |
%218 = amdgpu.mfma %190 * %162 + %217 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%219 = amdgpu.mfma %194 * %166 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%220 = amdgpu.mfma %197 * %169 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%221 = amdgpu.mfma %200 * %172 + %220 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%222 = amdgpu.mfma %203 * %175 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%223 = amdgpu.mfma %206 * %178 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%224 = amdgpu.mfma %209 * %181 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%225 = amdgpu.mfma %212 * %184 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x4x1xf32> | |
%227 = vector.insert %226, %215 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
scf.yield %227 : vector<2x2x1x1x4x1xf32> | |
} | |
%4 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%6 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %6, %subview_5[%4, %5] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%7 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%8 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %8, %subview_5[%4, %7] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%9 = affine.apply affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%10 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %10, %subview_5[%9, %5] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %11, %subview_5[%9, %7] {in_bounds = [true, true]} : vector<4x1xf32>, memref<64x64xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
memref.dealloc %subview_4 : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
memref.dealloc %subview : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
return | |
} | |
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- // | |
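// [reader annotation, not compiler output] The visible effect of fold-memref-alias-ops in the dump
// below is that the vector.transfer_read / vector.transfer_write ops address the padded workgroup
// allocations (%alloc : memref<128x68xf16>, %alloc_3 : memref<64x132xf16>) directly, folding the
// strided memref.subview indirection into the access indices; the output subview of %2 likewise
// no longer appears inside the scf.forall body.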
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%thread_id_x = gpu.thread_id x | |
%alloc = memref.alloc() : memref<128x68xf16, #gpu.address_space<workgroup>> | |
%subview = memref.subview %alloc[0, 0] [128, 64] [1, 1] : memref<128x68xf16, #gpu.address_space<workgroup>> to memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
%alloc_3 = memref.alloc() : memref<64x132xf16, #gpu.address_space<workgroup>> | |
%subview_4 = memref.subview %alloc_3[0, 0] [64, 128] [1, 1] : memref<64x132xf16, #gpu.address_space<workgroup>> to memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) { | |
gpu.barrier | |
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x] | |
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x] | |
%18 = vector.transfer_read %0[%16, %17], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x] | |
%20 = vector.transfer_read %0[%19, %17], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x] | |
%22 = vector.transfer_read %0[%21, %17], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x] | |
%24 = vector.transfer_read %0[%23, %17], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%25 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x] | |
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x] | |
%27 = vector.transfer_read %1[%25, %26], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x] | |
%29 = vector.transfer_read %1[%28, %26], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x] | |
%31 = vector.transfer_read %1[%30, %26], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%32 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x] | |
%33 = vector.transfer_read %1[%32, %26], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x] | |
%35 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x] | |
vector.transfer_write %18, %alloc_3[%34, %35] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%36 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x] | |
%37 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x] | |
vector.transfer_write %20, %alloc_3[%36, %37] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x] | |
%39 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x] | |
vector.transfer_write %22, %alloc_3[%38, %39] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%40 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x] | |
%41 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x] | |
vector.transfer_write %24, %alloc_3[%40, %41] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%42 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x] | |
%43 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
vector.transfer_write %27, %alloc[%42, %43] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x] | |
%45 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
vector.transfer_write %29, %alloc[%44, %45] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
%46 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x] | |
%47 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
vector.transfer_write %31, %alloc[%46, %47] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
%48 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x] | |
%49 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
vector.transfer_write %33, %alloc[%48, %49] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%50 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%51 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%52 = vector.transfer_read %alloc_3[%50, %51], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%53 = vector.insert_strided_slice %52, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%54 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%55 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%56 = vector.transfer_read %alloc_3[%54, %55], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%57 = vector.insert_strided_slice %56, %53 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%58 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%59 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x] | |
%60 = vector.transfer_read %alloc_3[%58, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%61 = vector.insert_strided_slice %60, %57 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%62 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%63 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x] | |
%64 = vector.transfer_read %alloc_3[%62, %63], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%65 = vector.insert_strided_slice %64, %61 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%66 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%67 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x] | |
%68 = vector.transfer_read %alloc_3[%66, %67], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%69 = vector.insert_strided_slice %68, %65 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%70 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%71 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x] | |
%72 = vector.transfer_read %alloc_3[%70, %71], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%73 = vector.insert_strided_slice %72, %69 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%74 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%75 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x] | |
%76 = vector.transfer_read %alloc_3[%74, %75], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%77 = vector.insert_strided_slice %76, %73 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%78 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%79 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x] | |
%80 = vector.transfer_read %alloc_3[%78, %79], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%81 = vector.insert_strided_slice %80, %77 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%82 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%83 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%84 = vector.transfer_read %alloc_3[%82, %83], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%85 = vector.insert_strided_slice %84, %81 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%86 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%87 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%88 = vector.transfer_read %alloc_3[%86, %87], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%89 = vector.insert_strided_slice %88, %85 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%90 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%91 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x] | |
%92 = vector.transfer_read %alloc_3[%90, %91], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%93 = vector.insert_strided_slice %92, %89 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%94 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%95 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x] | |
%96 = vector.transfer_read %alloc_3[%94, %95], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%97 = vector.insert_strided_slice %96, %93 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%98 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%99 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x] | |
%100 = vector.transfer_read %alloc_3[%98, %99], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%101 = vector.insert_strided_slice %100, %97 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%102 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%103 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x] | |
%104 = vector.transfer_read %alloc_3[%102, %103], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%105 = vector.insert_strided_slice %104, %101 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%106 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%107 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x] | |
%108 = vector.transfer_read %alloc_3[%106, %107], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%109 = vector.insert_strided_slice %108, %105 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%110 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%111 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x] | |
%112 = vector.transfer_read %alloc_3[%110, %111], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%113 = vector.insert_strided_slice %112, %109 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%114 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%115 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%116 = vector.transfer_read %alloc[%114, %115], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%117 = vector.insert_strided_slice %116, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%118 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%119 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%120 = vector.transfer_read %alloc[%118, %119], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%121 = vector.insert_strided_slice %120, %117 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%122 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%123 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%124 = vector.transfer_read %alloc[%122, %123], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%125 = vector.insert_strided_slice %124, %121 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%126 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%127 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%128 = vector.transfer_read %alloc[%126, %127], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%129 = vector.insert_strided_slice %128, %125 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%130 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x] | |
%131 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%132 = vector.transfer_read %alloc[%130, %131], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%133 = vector.insert_strided_slice %132, %129 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%134 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x] | |
%135 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%136 = vector.transfer_read %alloc[%134, %135], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%137 = vector.insert_strided_slice %136, %133 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%138 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x] | |
%139 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%140 = vector.transfer_read %alloc[%138, %139], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%141 = vector.insert_strided_slice %140, %137 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%142 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x] | |
%143 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%144 = vector.transfer_read %alloc[%142, %143], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%145 = vector.insert_strided_slice %144, %141 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%146 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x] | |
%147 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%148 = vector.transfer_read %alloc[%146, %147], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%149 = vector.insert_strided_slice %148, %145 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%150 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x] | |
%151 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%152 = vector.transfer_read %alloc[%150, %151], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%153 = vector.insert_strided_slice %152, %149 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%154 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x] | |
%155 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%156 = vector.transfer_read %alloc[%154, %155], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%157 = vector.insert_strided_slice %156, %153 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%158 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x] | |
%159 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%160 = vector.transfer_read %alloc[%158, %159], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%161 = vector.insert_strided_slice %160, %157 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%162 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x] | |
%163 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%164 = vector.transfer_read %alloc[%162, %163], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%165 = vector.insert_strided_slice %164, %161 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%166 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x] | |
%167 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%168 = vector.transfer_read %alloc[%166, %167], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%169 = vector.insert_strided_slice %168, %165 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%170 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x] | |
%171 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%172 = vector.transfer_read %alloc[%170, %171], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%173 = vector.insert_strided_slice %172, %169 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%174 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x] | |
%175 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%176 = vector.transfer_read %alloc[%174, %175], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%177 = vector.insert_strided_slice %176, %173 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
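// [editorial note, not compiler output] Each of the four (2x2) accumulator subtiles
// carried in %arg3 is updated by a chain of eight amdgpu.mfma 16x16x16 f16->f32 ops,
// covering this iteration's k=128 reduction slice in steps of 16.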
%178 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%179 = vector.extract %113[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%180 = vector.extract %177[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%181 = vector.shape_cast %179 : vector<1x1x1x4xf16> to vector<4xf16> | |
%182 = vector.shape_cast %180 : vector<1x1x4x1xf16> to vector<4xf16> | |
%183 = vector.shape_cast %178 : vector<1x1x4x1xf32> to vector<4xf32> | |
%184 = amdgpu.mfma %181 * %182 + %183 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%185 = vector.extract %113[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%186 = vector.extract %177[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%187 = vector.shape_cast %185 : vector<1x1x1x4xf16> to vector<4xf16> | |
%188 = vector.shape_cast %186 : vector<1x1x4x1xf16> to vector<4xf16> | |
%189 = amdgpu.mfma %187 * %188 + %184 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%190 = vector.extract %113[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%191 = vector.extract %177[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%192 = vector.shape_cast %190 : vector<1x1x1x4xf16> to vector<4xf16> | |
%193 = vector.shape_cast %191 : vector<1x1x4x1xf16> to vector<4xf16> | |
%194 = amdgpu.mfma %192 * %193 + %189 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%195 = vector.extract %113[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%196 = vector.extract %177[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%197 = vector.shape_cast %195 : vector<1x1x1x4xf16> to vector<4xf16> | |
%198 = vector.shape_cast %196 : vector<1x1x4x1xf16> to vector<4xf16> | |
%199 = amdgpu.mfma %197 * %198 + %194 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%200 = vector.extract %113[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%201 = vector.extract %177[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%202 = vector.shape_cast %200 : vector<1x1x1x4xf16> to vector<4xf16> | |
%203 = vector.shape_cast %201 : vector<1x1x4x1xf16> to vector<4xf16> | |
%204 = amdgpu.mfma %202 * %203 + %199 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%205 = vector.extract %113[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%206 = vector.extract %177[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%207 = vector.shape_cast %205 : vector<1x1x1x4xf16> to vector<4xf16> | |
%208 = vector.shape_cast %206 : vector<1x1x4x1xf16> to vector<4xf16> | |
%209 = amdgpu.mfma %207 * %208 + %204 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%210 = vector.extract %113[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%211 = vector.extract %177[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%212 = vector.shape_cast %210 : vector<1x1x1x4xf16> to vector<4xf16> | |
%213 = vector.shape_cast %211 : vector<1x1x4x1xf16> to vector<4xf16> | |
%214 = amdgpu.mfma %212 * %213 + %209 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%215 = vector.extract %113[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%216 = vector.extract %177[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%217 = vector.shape_cast %215 : vector<1x1x1x4xf16> to vector<4xf16> | |
%218 = vector.shape_cast %216 : vector<1x1x4x1xf16> to vector<4xf16> | |
%219 = amdgpu.mfma %217 * %218 + %214 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%220 = vector.shape_cast %219 : vector<4xf32> to vector<1x1x4x1xf32> | |
%221 = vector.insert %220, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%222 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%223 = vector.extract %177[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%224 = vector.shape_cast %223 : vector<1x1x4x1xf16> to vector<4xf16> | |
%225 = vector.shape_cast %222 : vector<1x1x4x1xf32> to vector<4xf32> | |
%226 = amdgpu.mfma %181 * %224 + %225 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%227 = vector.extract %177[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%228 = vector.shape_cast %227 : vector<1x1x4x1xf16> to vector<4xf16> | |
%229 = amdgpu.mfma %187 * %228 + %226 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%230 = vector.extract %177[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%231 = vector.shape_cast %230 : vector<1x1x4x1xf16> to vector<4xf16> | |
%232 = amdgpu.mfma %192 * %231 + %229 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%233 = vector.extract %177[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%234 = vector.shape_cast %233 : vector<1x1x4x1xf16> to vector<4xf16> | |
%235 = amdgpu.mfma %197 * %234 + %232 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%236 = vector.extract %177[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%237 = vector.shape_cast %236 : vector<1x1x4x1xf16> to vector<4xf16> | |
%238 = amdgpu.mfma %202 * %237 + %235 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%239 = vector.extract %177[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%240 = vector.shape_cast %239 : vector<1x1x4x1xf16> to vector<4xf16> | |
%241 = amdgpu.mfma %207 * %240 + %238 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%242 = vector.extract %177[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%243 = vector.shape_cast %242 : vector<1x1x4x1xf16> to vector<4xf16> | |
%244 = amdgpu.mfma %212 * %243 + %241 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%245 = vector.extract %177[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%246 = vector.shape_cast %245 : vector<1x1x4x1xf16> to vector<4xf16> | |
%247 = amdgpu.mfma %217 * %246 + %244 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%248 = vector.shape_cast %247 : vector<4xf32> to vector<1x1x4x1xf32> | |
%249 = vector.insert %248, %221 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%250 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%251 = vector.extract %113[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%252 = vector.shape_cast %251 : vector<1x1x1x4xf16> to vector<4xf16> | |
%253 = vector.shape_cast %250 : vector<1x1x4x1xf32> to vector<4xf32> | |
%254 = amdgpu.mfma %252 * %182 + %253 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%255 = vector.extract %113[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%256 = vector.shape_cast %255 : vector<1x1x1x4xf16> to vector<4xf16> | |
%257 = amdgpu.mfma %256 * %188 + %254 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%258 = vector.extract %113[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%259 = vector.shape_cast %258 : vector<1x1x1x4xf16> to vector<4xf16> | |
%260 = amdgpu.mfma %259 * %193 + %257 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%261 = vector.extract %113[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%262 = vector.shape_cast %261 : vector<1x1x1x4xf16> to vector<4xf16> | |
%263 = amdgpu.mfma %262 * %198 + %260 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%264 = vector.extract %113[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%265 = vector.shape_cast %264 : vector<1x1x1x4xf16> to vector<4xf16> | |
%266 = amdgpu.mfma %265 * %203 + %263 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%267 = vector.extract %113[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%268 = vector.shape_cast %267 : vector<1x1x1x4xf16> to vector<4xf16> | |
%269 = amdgpu.mfma %268 * %208 + %266 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%270 = vector.extract %113[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%271 = vector.shape_cast %270 : vector<1x1x1x4xf16> to vector<4xf16> | |
%272 = amdgpu.mfma %271 * %213 + %269 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%273 = vector.extract %113[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%274 = vector.shape_cast %273 : vector<1x1x1x4xf16> to vector<4xf16> | |
%275 = amdgpu.mfma %274 * %218 + %272 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%276 = vector.shape_cast %275 : vector<4xf32> to vector<1x1x4x1xf32> | |
%277 = vector.insert %276, %249 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%278 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%279 = vector.shape_cast %278 : vector<1x1x4x1xf32> to vector<4xf32> | |
%280 = amdgpu.mfma %252 * %224 + %279 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%281 = amdgpu.mfma %256 * %228 + %280 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%282 = amdgpu.mfma %259 * %231 + %281 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%283 = amdgpu.mfma %262 * %234 + %282 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%284 = amdgpu.mfma %265 * %237 + %283 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%285 = amdgpu.mfma %268 * %240 + %284 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%286 = amdgpu.mfma %271 * %243 + %285 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%287 = amdgpu.mfma %274 * %246 + %286 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%288 = vector.shape_cast %287 : vector<4xf32> to vector<1x1x4x1xf32> | |
%289 = vector.insert %288, %277 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
scf.yield %289 : vector<2x2x1x1x4x1xf32> | |
} | |
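// [editorial note, not compiler output] Loop epilogue: each thread writes its four
// 4x1 f32 accumulator fragments (the 2x2 subtiles of %3) back to the 256x256 output
// buffer %2 at offsets derived from the forall tile offsets (%arg0, %arg1) and %thread_id_x.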
%4 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%5 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16)>()[%arg0, %thread_id_x] | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16)>()[%arg1, %thread_id_x] | |
vector.transfer_write %4, %2[%5, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
%7 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%8 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16)>()[%arg0, %thread_id_x] | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16 + 16)>()[%arg1, %thread_id_x] | |
vector.transfer_write %7, %2[%8, %9] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
%10 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%11 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16 + 16)>()[%arg0, %thread_id_x] | |
%12 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16)>()[%arg1, %thread_id_x] | |
vector.transfer_write %10, %2[%11, %12] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
%13 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%14 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16 + 16)>()[%arg0, %thread_id_x] | |
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16 + 16)>()[%arg1, %thread_id_x] | |
vector.transfer_write %13, %2[%14, %15] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
memref.dealloc %subview_4 : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
memref.dealloc %subview : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
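// [editorial note, not compiler output] Relative to the previous dump, CSE folds the
// duplicated affine.apply index computations: each distinct index expression is now
// materialized once (e.g. %31, %36, %40, %41 below) and reused by every
// vector.transfer_read / vector.transfer_write that needs it.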
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%thread_id_x = gpu.thread_id x | |
%alloc = memref.alloc() : memref<128x68xf16, #gpu.address_space<workgroup>> | |
%subview = memref.subview %alloc[0, 0] [128, 64] [1, 1] : memref<128x68xf16, #gpu.address_space<workgroup>> to memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
%alloc_3 = memref.alloc() : memref<64x132xf16, #gpu.address_space<workgroup>> | |
%subview_4 = memref.subview %alloc_3[0, 0] [64, 128] [1, 1] : memref<64x132xf16, #gpu.address_space<workgroup>> to memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
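// [editorial note, not compiler output] The workgroup buffers are allocated as 64x132
// and 128x68 even though only the 64x128 and 128x64 subviews hold the tiles; the four
// extra f16 columns of padding are consistent with the shared-memory bank-conflict
// mitigation enabled in the pipeline options above.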
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) { | |
gpu.barrier | |
%12 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x] | |
%13 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x] | |
%14 = vector.transfer_read %0[%12, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x] | |
%16 = vector.transfer_read %0[%15, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x] | |
%18 = vector.transfer_read %0[%17, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x] | |
%20 = vector.transfer_read %0[%19, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x] | |
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x] | |
%23 = vector.transfer_read %1[%21, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x] | |
%25 = vector.transfer_read %1[%24, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x] | |
%27 = vector.transfer_read %1[%26, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x] | |
%29 = vector.transfer_read %1[%28, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x] | |
%31 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x] | |
vector.transfer_write %14, %alloc_3[%30, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%32 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x] | |
vector.transfer_write %16, %alloc_3[%32, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%33 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x] | |
vector.transfer_write %18, %alloc_3[%33, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x] | |
vector.transfer_write %20, %alloc_3[%34, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x] | |
%36 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
vector.transfer_write %23, %alloc[%35, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
%37 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x] | |
vector.transfer_write %25, %alloc[%37, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x] | |
vector.transfer_write %27, %alloc[%38, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
%39 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x] | |
vector.transfer_write %29, %alloc[%39, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
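// [editorial note, not compiler output] Same structure as in the previous dump:
// global->shared staging above the barrier, shared->register fragment loads below,
// now using the CSE'd row/column indices (%40, %41, %44, ...).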
%40 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%42 = vector.transfer_read %alloc_3[%40, %41], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%43 = vector.insert_strided_slice %42, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%45 = vector.transfer_read %alloc_3[%40, %44], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%46 = vector.insert_strided_slice %45, %43 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%47 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x] | |
%48 = vector.transfer_read %alloc_3[%40, %47], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%49 = vector.insert_strided_slice %48, %46 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%50 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x] | |
%51 = vector.transfer_read %alloc_3[%40, %50], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%52 = vector.insert_strided_slice %51, %49 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%53 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x] | |
%54 = vector.transfer_read %alloc_3[%40, %53], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%55 = vector.insert_strided_slice %54, %52 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%56 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x] | |
%57 = vector.transfer_read %alloc_3[%40, %56], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%58 = vector.insert_strided_slice %57, %55 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%59 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x] | |
%60 = vector.transfer_read %alloc_3[%40, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%61 = vector.insert_strided_slice %60, %58 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%62 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x] | |
%63 = vector.transfer_read %alloc_3[%40, %62], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%64 = vector.insert_strided_slice %63, %61 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%65 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%66 = vector.transfer_read %alloc_3[%65, %41], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%67 = vector.insert_strided_slice %66, %64 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%68 = vector.transfer_read %alloc_3[%65, %44], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%69 = vector.insert_strided_slice %68, %67 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%70 = vector.transfer_read %alloc_3[%65, %47], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%71 = vector.insert_strided_slice %70, %69 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%72 = vector.transfer_read %alloc_3[%65, %50], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%73 = vector.insert_strided_slice %72, %71 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%74 = vector.transfer_read %alloc_3[%65, %53], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%75 = vector.insert_strided_slice %74, %73 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%76 = vector.transfer_read %alloc_3[%65, %56], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%77 = vector.insert_strided_slice %76, %75 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%78 = vector.transfer_read %alloc_3[%65, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%79 = vector.insert_strided_slice %78, %77 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%80 = vector.transfer_read %alloc_3[%65, %62], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%81 = vector.insert_strided_slice %80, %79 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%82 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%83 = vector.transfer_read %alloc[%41, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%84 = vector.insert_strided_slice %83, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%85 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%86 = vector.transfer_read %alloc[%41, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%87 = vector.insert_strided_slice %86, %84 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%88 = vector.transfer_read %alloc[%44, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%89 = vector.insert_strided_slice %88, %87 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%90 = vector.transfer_read %alloc[%44, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%91 = vector.insert_strided_slice %90, %89 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%92 = vector.transfer_read %alloc[%47, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%93 = vector.insert_strided_slice %92, %91 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%94 = vector.transfer_read %alloc[%47, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%95 = vector.insert_strided_slice %94, %93 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%96 = vector.transfer_read %alloc[%50, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%97 = vector.insert_strided_slice %96, %95 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%98 = vector.transfer_read %alloc[%50, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%99 = vector.insert_strided_slice %98, %97 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%100 = vector.transfer_read %alloc[%53, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%101 = vector.insert_strided_slice %100, %99 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%102 = vector.transfer_read %alloc[%53, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%103 = vector.insert_strided_slice %102, %101 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%104 = vector.transfer_read %alloc[%56, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%105 = vector.insert_strided_slice %104, %103 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%106 = vector.transfer_read %alloc[%56, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%107 = vector.insert_strided_slice %106, %105 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%108 = vector.transfer_read %alloc[%59, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%109 = vector.insert_strided_slice %108, %107 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%110 = vector.transfer_read %alloc[%59, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%111 = vector.insert_strided_slice %110, %109 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%112 = vector.transfer_read %alloc[%62, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%113 = vector.insert_strided_slice %112, %111 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%114 = vector.transfer_read %alloc[%62, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%115 = vector.insert_strided_slice %114, %113 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
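// [editorial note, not compiler output] The mfma chain below consumes the LHS fragments
// in %81 (vector<2x8x1x1x1x4xf16>) and the RHS fragments in %115 (vector<8x2x1x1x4x1xf16>)
// and accumulates into %arg3 exactly as in the pre-CSE dump.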
%116 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%117 = vector.extract %81[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%118 = vector.extract %115[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%119 = vector.shape_cast %117 : vector<1x1x1x4xf16> to vector<4xf16> | |
%120 = vector.shape_cast %118 : vector<1x1x4x1xf16> to vector<4xf16> | |
%121 = vector.shape_cast %116 : vector<1x1x4x1xf32> to vector<4xf32> | |
%122 = amdgpu.mfma %119 * %120 + %121 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%123 = vector.extract %81[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%124 = vector.extract %115[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%125 = vector.shape_cast %123 : vector<1x1x1x4xf16> to vector<4xf16> | |
%126 = vector.shape_cast %124 : vector<1x1x4x1xf16> to vector<4xf16> | |
%127 = amdgpu.mfma %125 * %126 + %122 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%128 = vector.extract %81[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%129 = vector.extract %115[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%130 = vector.shape_cast %128 : vector<1x1x1x4xf16> to vector<4xf16> | |
%131 = vector.shape_cast %129 : vector<1x1x4x1xf16> to vector<4xf16> | |
%132 = amdgpu.mfma %130 * %131 + %127 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%133 = vector.extract %81[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%134 = vector.extract %115[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%135 = vector.shape_cast %133 : vector<1x1x1x4xf16> to vector<4xf16> | |
%136 = vector.shape_cast %134 : vector<1x1x4x1xf16> to vector<4xf16> | |
%137 = amdgpu.mfma %135 * %136 + %132 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%138 = vector.extract %81[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%139 = vector.extract %115[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%140 = vector.shape_cast %138 : vector<1x1x1x4xf16> to vector<4xf16> | |
%141 = vector.shape_cast %139 : vector<1x1x4x1xf16> to vector<4xf16> | |
%142 = amdgpu.mfma %140 * %141 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%143 = vector.extract %81[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%144 = vector.extract %115[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%145 = vector.shape_cast %143 : vector<1x1x1x4xf16> to vector<4xf16> | |
%146 = vector.shape_cast %144 : vector<1x1x4x1xf16> to vector<4xf16> | |
%147 = amdgpu.mfma %145 * %146 + %142 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%148 = vector.extract %81[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%149 = vector.extract %115[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%150 = vector.shape_cast %148 : vector<1x1x1x4xf16> to vector<4xf16> | |
%151 = vector.shape_cast %149 : vector<1x1x4x1xf16> to vector<4xf16> | |
%152 = amdgpu.mfma %150 * %151 + %147 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%153 = vector.extract %81[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%154 = vector.extract %115[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%155 = vector.shape_cast %153 : vector<1x1x1x4xf16> to vector<4xf16> | |
%156 = vector.shape_cast %154 : vector<1x1x4x1xf16> to vector<4xf16> | |
%157 = amdgpu.mfma %155 * %156 + %152 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%158 = vector.shape_cast %157 : vector<4xf32> to vector<1x1x4x1xf32> | |
%159 = vector.insert %158, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%160 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%161 = vector.extract %115[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%162 = vector.shape_cast %161 : vector<1x1x4x1xf16> to vector<4xf16> | |
%163 = vector.shape_cast %160 : vector<1x1x4x1xf32> to vector<4xf32> | |
%164 = amdgpu.mfma %119 * %162 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%165 = vector.extract %115[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%166 = vector.shape_cast %165 : vector<1x1x4x1xf16> to vector<4xf16> | |
%167 = amdgpu.mfma %125 * %166 + %164 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%168 = vector.extract %115[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%169 = vector.shape_cast %168 : vector<1x1x4x1xf16> to vector<4xf16> | |
%170 = amdgpu.mfma %130 * %169 + %167 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%171 = vector.extract %115[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%172 = vector.shape_cast %171 : vector<1x1x4x1xf16> to vector<4xf16> | |
%173 = amdgpu.mfma %135 * %172 + %170 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%174 = vector.extract %115[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%175 = vector.shape_cast %174 : vector<1x1x4x1xf16> to vector<4xf16> | |
%176 = amdgpu.mfma %140 * %175 + %173 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%177 = vector.extract %115[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%178 = vector.shape_cast %177 : vector<1x1x4x1xf16> to vector<4xf16> | |
%179 = amdgpu.mfma %145 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%180 = vector.extract %115[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%181 = vector.shape_cast %180 : vector<1x1x4x1xf16> to vector<4xf16> | |
%182 = amdgpu.mfma %150 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%183 = vector.extract %115[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%184 = vector.shape_cast %183 : vector<1x1x4x1xf16> to vector<4xf16> | |
%185 = amdgpu.mfma %155 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x4x1xf32> | |
%187 = vector.insert %186, %159 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%188 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%189 = vector.extract %81[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%190 = vector.shape_cast %189 : vector<1x1x1x4xf16> to vector<4xf16> | |
%191 = vector.shape_cast %188 : vector<1x1x4x1xf32> to vector<4xf32> | |
%192 = amdgpu.mfma %190 * %120 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%193 = vector.extract %81[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%194 = vector.shape_cast %193 : vector<1x1x1x4xf16> to vector<4xf16> | |
%195 = amdgpu.mfma %194 * %126 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%196 = vector.extract %81[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%197 = vector.shape_cast %196 : vector<1x1x1x4xf16> to vector<4xf16> | |
%198 = amdgpu.mfma %197 * %131 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%199 = vector.extract %81[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%200 = vector.shape_cast %199 : vector<1x1x1x4xf16> to vector<4xf16> | |
%201 = amdgpu.mfma %200 * %136 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%202 = vector.extract %81[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%203 = vector.shape_cast %202 : vector<1x1x1x4xf16> to vector<4xf16> | |
%204 = amdgpu.mfma %203 * %141 + %201 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%205 = vector.extract %81[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%206 = vector.shape_cast %205 : vector<1x1x1x4xf16> to vector<4xf16> | |
%207 = amdgpu.mfma %206 * %146 + %204 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%208 = vector.extract %81[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%209 = vector.shape_cast %208 : vector<1x1x1x4xf16> to vector<4xf16> | |
%210 = amdgpu.mfma %209 * %151 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%211 = vector.extract %81[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%212 = vector.shape_cast %211 : vector<1x1x1x4xf16> to vector<4xf16> | |
%213 = amdgpu.mfma %212 * %156 + %210 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%214 = vector.shape_cast %213 : vector<4xf32> to vector<1x1x4x1xf32> | |
%215 = vector.insert %214, %187 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%216 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%217 = vector.shape_cast %216 : vector<1x1x4x1xf32> to vector<4xf32> | |
%218 = amdgpu.mfma %190 * %162 + %217 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%219 = amdgpu.mfma %194 * %166 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%220 = amdgpu.mfma %197 * %169 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%221 = amdgpu.mfma %200 * %172 + %220 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%222 = amdgpu.mfma %203 * %175 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%223 = amdgpu.mfma %206 * %178 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%224 = amdgpu.mfma %209 * %181 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%225 = amdgpu.mfma %212 * %184 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x4x1xf32> | |
%227 = vector.insert %226, %215 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
scf.yield %227 : vector<2x2x1x1x4x1xf32> | |
} | |
%4 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%5 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16)>()[%arg0, %thread_id_x] | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16)>()[%arg1, %thread_id_x] | |
vector.transfer_write %4, %2[%5, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
%7 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%8 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16 + 16)>()[%arg1, %thread_id_x] | |
vector.transfer_write %7, %2[%5, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
%9 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%10 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16 + 16)>()[%arg0, %thread_id_x] | |
vector.transfer_write %9, %2[%10, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %11, %2[%10, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
memref.dealloc %subview_4 : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
memref.dealloc %subview : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%thread_id_x = gpu.thread_id x | |
%alloc = memref.alloc() : memref<128x68xf16, #gpu.address_space<workgroup>> | |
%subview = memref.subview %alloc[0, 0] [128, 64] [1, 1] : memref<128x68xf16, #gpu.address_space<workgroup>> to memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
%alloc_3 = memref.alloc() : memref<64x132xf16, #gpu.address_space<workgroup>> | |
%subview_4 = memref.subview %alloc_3[0, 0] [64, 128] [1, 1] : memref<64x132xf16, #gpu.address_space<workgroup>> to memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) { | |
gpu.barrier | |
%12 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x] | |
%13 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x] | |
%14 = vector.transfer_read %0[%12, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x] | |
%16 = vector.transfer_read %0[%15, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x] | |
%18 = vector.transfer_read %0[%17, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x] | |
%20 = vector.transfer_read %0[%19, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x] | |
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x] | |
%23 = vector.transfer_read %1[%21, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x] | |
%25 = vector.transfer_read %1[%24, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x] | |
%27 = vector.transfer_read %1[%26, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x] | |
%29 = vector.transfer_read %1[%28, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x] | |
%31 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x] | |
vector.transfer_write %14, %alloc_3[%30, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%32 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x] | |
vector.transfer_write %16, %alloc_3[%32, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%33 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x] | |
vector.transfer_write %18, %alloc_3[%33, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x] | |
vector.transfer_write %20, %alloc_3[%34, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x] | |
%36 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
vector.transfer_write %23, %alloc[%35, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
%37 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x] | |
vector.transfer_write %25, %alloc[%37, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x] | |
vector.transfer_write %27, %alloc[%38, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
%39 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x] | |
vector.transfer_write %29, %alloc[%39, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%40 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%42 = vector.transfer_read %alloc_3[%40, %41], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%43 = vector.insert_strided_slice %42, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%45 = vector.transfer_read %alloc_3[%40, %44], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%46 = vector.insert_strided_slice %45, %43 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%47 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x] | |
%48 = vector.transfer_read %alloc_3[%40, %47], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%49 = vector.insert_strided_slice %48, %46 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%50 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x] | |
%51 = vector.transfer_read %alloc_3[%40, %50], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%52 = vector.insert_strided_slice %51, %49 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%53 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x] | |
%54 = vector.transfer_read %alloc_3[%40, %53], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%55 = vector.insert_strided_slice %54, %52 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%56 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x] | |
%57 = vector.transfer_read %alloc_3[%40, %56], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%58 = vector.insert_strided_slice %57, %55 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%59 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x] | |
%60 = vector.transfer_read %alloc_3[%40, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%61 = vector.insert_strided_slice %60, %58 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%62 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x] | |
%63 = vector.transfer_read %alloc_3[%40, %62], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%64 = vector.insert_strided_slice %63, %61 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%65 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%66 = vector.transfer_read %alloc_3[%65, %41], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%67 = vector.insert_strided_slice %66, %64 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%68 = vector.transfer_read %alloc_3[%65, %44], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%69 = vector.insert_strided_slice %68, %67 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%70 = vector.transfer_read %alloc_3[%65, %47], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%71 = vector.insert_strided_slice %70, %69 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%72 = vector.transfer_read %alloc_3[%65, %50], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%73 = vector.insert_strided_slice %72, %71 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%74 = vector.transfer_read %alloc_3[%65, %53], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%75 = vector.insert_strided_slice %74, %73 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%76 = vector.transfer_read %alloc_3[%65, %56], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%77 = vector.insert_strided_slice %76, %75 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%78 = vector.transfer_read %alloc_3[%65, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%79 = vector.insert_strided_slice %78, %77 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%80 = vector.transfer_read %alloc_3[%65, %62], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%81 = vector.insert_strided_slice %80, %79 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%82 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%83 = vector.transfer_read %alloc[%41, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%84 = vector.insert_strided_slice %83, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%85 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%86 = vector.transfer_read %alloc[%41, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%87 = vector.insert_strided_slice %86, %84 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%88 = vector.transfer_read %alloc[%44, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%89 = vector.insert_strided_slice %88, %87 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%90 = vector.transfer_read %alloc[%44, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%91 = vector.insert_strided_slice %90, %89 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%92 = vector.transfer_read %alloc[%47, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%93 = vector.insert_strided_slice %92, %91 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%94 = vector.transfer_read %alloc[%47, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%95 = vector.insert_strided_slice %94, %93 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%96 = vector.transfer_read %alloc[%50, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%97 = vector.insert_strided_slice %96, %95 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%98 = vector.transfer_read %alloc[%50, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%99 = vector.insert_strided_slice %98, %97 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%100 = vector.transfer_read %alloc[%53, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%101 = vector.insert_strided_slice %100, %99 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%102 = vector.transfer_read %alloc[%53, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%103 = vector.insert_strided_slice %102, %101 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%104 = vector.transfer_read %alloc[%56, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%105 = vector.insert_strided_slice %104, %103 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%106 = vector.transfer_read %alloc[%56, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%107 = vector.insert_strided_slice %106, %105 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%108 = vector.transfer_read %alloc[%59, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%109 = vector.insert_strided_slice %108, %107 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%110 = vector.transfer_read %alloc[%59, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%111 = vector.insert_strided_slice %110, %109 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%112 = vector.transfer_read %alloc[%62, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%113 = vector.insert_strided_slice %112, %111 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%114 = vector.transfer_read %alloc[%62, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%115 = vector.insert_strided_slice %114, %113 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%116 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%117 = vector.extract %81[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%118 = vector.extract %115[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%119 = vector.shape_cast %117 : vector<1x1x1x4xf16> to vector<4xf16> | |
%120 = vector.shape_cast %118 : vector<1x1x4x1xf16> to vector<4xf16> | |
%121 = vector.shape_cast %116 : vector<1x1x4x1xf32> to vector<4xf32> | |
%122 = amdgpu.mfma %119 * %120 + %121 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%123 = vector.extract %81[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%124 = vector.extract %115[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%125 = vector.shape_cast %123 : vector<1x1x1x4xf16> to vector<4xf16> | |
%126 = vector.shape_cast %124 : vector<1x1x4x1xf16> to vector<4xf16> | |
%127 = amdgpu.mfma %125 * %126 + %122 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%128 = vector.extract %81[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%129 = vector.extract %115[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%130 = vector.shape_cast %128 : vector<1x1x1x4xf16> to vector<4xf16> | |
%131 = vector.shape_cast %129 : vector<1x1x4x1xf16> to vector<4xf16> | |
%132 = amdgpu.mfma %130 * %131 + %127 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%133 = vector.extract %81[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%134 = vector.extract %115[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%135 = vector.shape_cast %133 : vector<1x1x1x4xf16> to vector<4xf16> | |
%136 = vector.shape_cast %134 : vector<1x1x4x1xf16> to vector<4xf16> | |
%137 = amdgpu.mfma %135 * %136 + %132 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%138 = vector.extract %81[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%139 = vector.extract %115[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%140 = vector.shape_cast %138 : vector<1x1x1x4xf16> to vector<4xf16> | |
%141 = vector.shape_cast %139 : vector<1x1x4x1xf16> to vector<4xf16> | |
%142 = amdgpu.mfma %140 * %141 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%143 = vector.extract %81[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%144 = vector.extract %115[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%145 = vector.shape_cast %143 : vector<1x1x1x4xf16> to vector<4xf16> | |
%146 = vector.shape_cast %144 : vector<1x1x4x1xf16> to vector<4xf16> | |
%147 = amdgpu.mfma %145 * %146 + %142 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%148 = vector.extract %81[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%149 = vector.extract %115[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%150 = vector.shape_cast %148 : vector<1x1x1x4xf16> to vector<4xf16> | |
%151 = vector.shape_cast %149 : vector<1x1x4x1xf16> to vector<4xf16> | |
%152 = amdgpu.mfma %150 * %151 + %147 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%153 = vector.extract %81[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%154 = vector.extract %115[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%155 = vector.shape_cast %153 : vector<1x1x1x4xf16> to vector<4xf16> | |
%156 = vector.shape_cast %154 : vector<1x1x4x1xf16> to vector<4xf16> | |
%157 = amdgpu.mfma %155 * %156 + %152 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%158 = vector.shape_cast %157 : vector<4xf32> to vector<1x1x4x1xf32> | |
%159 = vector.insert %158, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%160 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%161 = vector.extract %115[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%162 = vector.shape_cast %161 : vector<1x1x4x1xf16> to vector<4xf16> | |
%163 = vector.shape_cast %160 : vector<1x1x4x1xf32> to vector<4xf32> | |
%164 = amdgpu.mfma %119 * %162 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%165 = vector.extract %115[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%166 = vector.shape_cast %165 : vector<1x1x4x1xf16> to vector<4xf16> | |
%167 = amdgpu.mfma %125 * %166 + %164 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%168 = vector.extract %115[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%169 = vector.shape_cast %168 : vector<1x1x4x1xf16> to vector<4xf16> | |
%170 = amdgpu.mfma %130 * %169 + %167 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%171 = vector.extract %115[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%172 = vector.shape_cast %171 : vector<1x1x4x1xf16> to vector<4xf16> | |
%173 = amdgpu.mfma %135 * %172 + %170 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%174 = vector.extract %115[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%175 = vector.shape_cast %174 : vector<1x1x4x1xf16> to vector<4xf16> | |
%176 = amdgpu.mfma %140 * %175 + %173 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%177 = vector.extract %115[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%178 = vector.shape_cast %177 : vector<1x1x4x1xf16> to vector<4xf16> | |
%179 = amdgpu.mfma %145 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%180 = vector.extract %115[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%181 = vector.shape_cast %180 : vector<1x1x4x1xf16> to vector<4xf16> | |
%182 = amdgpu.mfma %150 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%183 = vector.extract %115[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%184 = vector.shape_cast %183 : vector<1x1x4x1xf16> to vector<4xf16> | |
%185 = amdgpu.mfma %155 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x4x1xf32> | |
%187 = vector.insert %186, %159 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%188 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%189 = vector.extract %81[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%190 = vector.shape_cast %189 : vector<1x1x1x4xf16> to vector<4xf16> | |
%191 = vector.shape_cast %188 : vector<1x1x4x1xf32> to vector<4xf32> | |
%192 = amdgpu.mfma %190 * %120 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%193 = vector.extract %81[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%194 = vector.shape_cast %193 : vector<1x1x1x4xf16> to vector<4xf16> | |
%195 = amdgpu.mfma %194 * %126 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%196 = vector.extract %81[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%197 = vector.shape_cast %196 : vector<1x1x1x4xf16> to vector<4xf16> | |
%198 = amdgpu.mfma %197 * %131 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%199 = vector.extract %81[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%200 = vector.shape_cast %199 : vector<1x1x1x4xf16> to vector<4xf16> | |
%201 = amdgpu.mfma %200 * %136 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%202 = vector.extract %81[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%203 = vector.shape_cast %202 : vector<1x1x1x4xf16> to vector<4xf16> | |
%204 = amdgpu.mfma %203 * %141 + %201 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%205 = vector.extract %81[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%206 = vector.shape_cast %205 : vector<1x1x1x4xf16> to vector<4xf16> | |
%207 = amdgpu.mfma %206 * %146 + %204 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%208 = vector.extract %81[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%209 = vector.shape_cast %208 : vector<1x1x1x4xf16> to vector<4xf16> | |
%210 = amdgpu.mfma %209 * %151 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%211 = vector.extract %81[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%212 = vector.shape_cast %211 : vector<1x1x1x4xf16> to vector<4xf16> | |
%213 = amdgpu.mfma %212 * %156 + %210 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%214 = vector.shape_cast %213 : vector<4xf32> to vector<1x1x4x1xf32> | |
%215 = vector.insert %214, %187 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%216 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%217 = vector.shape_cast %216 : vector<1x1x4x1xf32> to vector<4xf32> | |
%218 = amdgpu.mfma %190 * %162 + %217 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%219 = amdgpu.mfma %194 * %166 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%220 = amdgpu.mfma %197 * %169 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%221 = amdgpu.mfma %200 * %172 + %220 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%222 = amdgpu.mfma %203 * %175 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%223 = amdgpu.mfma %206 * %178 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%224 = amdgpu.mfma %209 * %181 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%225 = amdgpu.mfma %212 * %184 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x4x1xf32> | |
%227 = vector.insert %226, %215 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
scf.yield %227 : vector<2x2x1x1x4x1xf32> | |
} | |
%4 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%5 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16)>()[%arg0, %thread_id_x] | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16)>()[%arg1, %thread_id_x] | |
vector.transfer_write %4, %2[%5, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
%7 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%8 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16 + 16)>()[%arg1, %thread_id_x] | |
vector.transfer_write %7, %2[%5, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
%9 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%10 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16 + 16)>()[%arg0, %thread_id_x] | |
vector.transfer_write %9, %2[%10, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %11, %2[%10, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
memref.dealloc %subview_4 : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
memref.dealloc %subview : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%thread_id_x = gpu.thread_id x | |
%alloc = memref.alloc() : memref<128x68xf16, #gpu.address_space<workgroup>> | |
%subview = memref.subview %alloc[0, 0] [128, 64] [1, 1] : memref<128x68xf16, #gpu.address_space<workgroup>> to memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
%alloc_3 = memref.alloc() : memref<64x132xf16, #gpu.address_space<workgroup>> | |
%subview_4 = memref.subview %alloc_3[0, 0] [64, 128] [1, 1] : memref<64x132xf16, #gpu.address_space<workgroup>> to memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) { | |
gpu.barrier | |
%12 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x] | |
%13 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x] | |
%14 = vector.transfer_read %0[%12, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x] | |
%16 = vector.transfer_read %0[%15, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x] | |
%18 = vector.transfer_read %0[%17, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x] | |
%20 = vector.transfer_read %0[%19, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x] | |
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x] | |
%23 = vector.transfer_read %1[%21, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x] | |
%25 = vector.transfer_read %1[%24, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x] | |
%27 = vector.transfer_read %1[%26, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x] | |
%29 = vector.transfer_read %1[%28, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x] | |
%31 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x] | |
vector.transfer_write %14, %alloc_3[%30, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%32 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x] | |
vector.transfer_write %16, %alloc_3[%32, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%33 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x] | |
vector.transfer_write %18, %alloc_3[%33, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x] | |
vector.transfer_write %20, %alloc_3[%34, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x] | |
%36 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
vector.transfer_write %23, %alloc[%35, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
%37 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x] | |
vector.transfer_write %25, %alloc[%37, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x] | |
vector.transfer_write %27, %alloc[%38, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
%39 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x] | |
vector.transfer_write %29, %alloc[%39, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
gpu.barrier | |
%40 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%42 = vector.transfer_read %alloc_3[%40, %41], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%43 = vector.insert_strided_slice %42, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%45 = vector.transfer_read %alloc_3[%40, %44], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%46 = vector.insert_strided_slice %45, %43 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%47 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x] | |
%48 = vector.transfer_read %alloc_3[%40, %47], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%49 = vector.insert_strided_slice %48, %46 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%50 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x] | |
%51 = vector.transfer_read %alloc_3[%40, %50], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%52 = vector.insert_strided_slice %51, %49 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%53 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x] | |
%54 = vector.transfer_read %alloc_3[%40, %53], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%55 = vector.insert_strided_slice %54, %52 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%56 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x] | |
%57 = vector.transfer_read %alloc_3[%40, %56], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%58 = vector.insert_strided_slice %57, %55 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%59 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x] | |
%60 = vector.transfer_read %alloc_3[%40, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%61 = vector.insert_strided_slice %60, %58 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%62 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x] | |
%63 = vector.transfer_read %alloc_3[%40, %62], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%64 = vector.insert_strided_slice %63, %61 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%65 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%66 = vector.transfer_read %alloc_3[%65, %41], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%67 = vector.insert_strided_slice %66, %64 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%68 = vector.transfer_read %alloc_3[%65, %44], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%69 = vector.insert_strided_slice %68, %67 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%70 = vector.transfer_read %alloc_3[%65, %47], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%71 = vector.insert_strided_slice %70, %69 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%72 = vector.transfer_read %alloc_3[%65, %50], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%73 = vector.insert_strided_slice %72, %71 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%74 = vector.transfer_read %alloc_3[%65, %53], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%75 = vector.insert_strided_slice %74, %73 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%76 = vector.transfer_read %alloc_3[%65, %56], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%77 = vector.insert_strided_slice %76, %75 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%78 = vector.transfer_read %alloc_3[%65, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%79 = vector.insert_strided_slice %78, %77 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%80 = vector.transfer_read %alloc_3[%65, %62], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%81 = vector.insert_strided_slice %80, %79 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%82 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%83 = vector.transfer_read %alloc[%41, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%84 = vector.insert_strided_slice %83, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%85 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%86 = vector.transfer_read %alloc[%41, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%87 = vector.insert_strided_slice %86, %84 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%88 = vector.transfer_read %alloc[%44, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%89 = vector.insert_strided_slice %88, %87 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%90 = vector.transfer_read %alloc[%44, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%91 = vector.insert_strided_slice %90, %89 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%92 = vector.transfer_read %alloc[%47, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%93 = vector.insert_strided_slice %92, %91 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%94 = vector.transfer_read %alloc[%47, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%95 = vector.insert_strided_slice %94, %93 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%96 = vector.transfer_read %alloc[%50, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%97 = vector.insert_strided_slice %96, %95 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%98 = vector.transfer_read %alloc[%50, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%99 = vector.insert_strided_slice %98, %97 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%100 = vector.transfer_read %alloc[%53, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%101 = vector.insert_strided_slice %100, %99 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%102 = vector.transfer_read %alloc[%53, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%103 = vector.insert_strided_slice %102, %101 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%104 = vector.transfer_read %alloc[%56, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%105 = vector.insert_strided_slice %104, %103 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%106 = vector.transfer_read %alloc[%56, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%107 = vector.insert_strided_slice %106, %105 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%108 = vector.transfer_read %alloc[%59, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%109 = vector.insert_strided_slice %108, %107 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%110 = vector.transfer_read %alloc[%59, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%111 = vector.insert_strided_slice %110, %109 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%112 = vector.transfer_read %alloc[%62, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%113 = vector.insert_strided_slice %112, %111 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%114 = vector.transfer_read %alloc[%62, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%115 = vector.insert_strided_slice %114, %113 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%116 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%117 = vector.extract %81[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%118 = vector.extract %115[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%119 = vector.shape_cast %117 : vector<1x1x1x4xf16> to vector<4xf16> | |
%120 = vector.shape_cast %118 : vector<1x1x4x1xf16> to vector<4xf16> | |
%121 = vector.shape_cast %116 : vector<1x1x4x1xf32> to vector<4xf32> | |
%122 = amdgpu.mfma %119 * %120 + %121 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%123 = vector.extract %81[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%124 = vector.extract %115[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%125 = vector.shape_cast %123 : vector<1x1x1x4xf16> to vector<4xf16> | |
%126 = vector.shape_cast %124 : vector<1x1x4x1xf16> to vector<4xf16> | |
%127 = amdgpu.mfma %125 * %126 + %122 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%128 = vector.extract %81[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%129 = vector.extract %115[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%130 = vector.shape_cast %128 : vector<1x1x1x4xf16> to vector<4xf16> | |
%131 = vector.shape_cast %129 : vector<1x1x4x1xf16> to vector<4xf16> | |
%132 = amdgpu.mfma %130 * %131 + %127 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%133 = vector.extract %81[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%134 = vector.extract %115[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%135 = vector.shape_cast %133 : vector<1x1x1x4xf16> to vector<4xf16> | |
%136 = vector.shape_cast %134 : vector<1x1x4x1xf16> to vector<4xf16> | |
%137 = amdgpu.mfma %135 * %136 + %132 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%138 = vector.extract %81[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%139 = vector.extract %115[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%140 = vector.shape_cast %138 : vector<1x1x1x4xf16> to vector<4xf16> | |
%141 = vector.shape_cast %139 : vector<1x1x4x1xf16> to vector<4xf16> | |
%142 = amdgpu.mfma %140 * %141 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%143 = vector.extract %81[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%144 = vector.extract %115[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%145 = vector.shape_cast %143 : vector<1x1x1x4xf16> to vector<4xf16> | |
%146 = vector.shape_cast %144 : vector<1x1x4x1xf16> to vector<4xf16> | |
%147 = amdgpu.mfma %145 * %146 + %142 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%148 = vector.extract %81[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%149 = vector.extract %115[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%150 = vector.shape_cast %148 : vector<1x1x1x4xf16> to vector<4xf16> | |
%151 = vector.shape_cast %149 : vector<1x1x4x1xf16> to vector<4xf16> | |
%152 = amdgpu.mfma %150 * %151 + %147 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%153 = vector.extract %81[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%154 = vector.extract %115[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%155 = vector.shape_cast %153 : vector<1x1x1x4xf16> to vector<4xf16> | |
%156 = vector.shape_cast %154 : vector<1x1x4x1xf16> to vector<4xf16> | |
%157 = amdgpu.mfma %155 * %156 + %152 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%158 = vector.shape_cast %157 : vector<4xf32> to vector<1x1x4x1xf32> | |
%159 = vector.insert %158, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%160 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%161 = vector.extract %115[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%162 = vector.shape_cast %161 : vector<1x1x4x1xf16> to vector<4xf16> | |
%163 = vector.shape_cast %160 : vector<1x1x4x1xf32> to vector<4xf32> | |
%164 = amdgpu.mfma %119 * %162 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%165 = vector.extract %115[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%166 = vector.shape_cast %165 : vector<1x1x4x1xf16> to vector<4xf16> | |
%167 = amdgpu.mfma %125 * %166 + %164 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%168 = vector.extract %115[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%169 = vector.shape_cast %168 : vector<1x1x4x1xf16> to vector<4xf16> | |
%170 = amdgpu.mfma %130 * %169 + %167 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%171 = vector.extract %115[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%172 = vector.shape_cast %171 : vector<1x1x4x1xf16> to vector<4xf16> | |
%173 = amdgpu.mfma %135 * %172 + %170 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%174 = vector.extract %115[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%175 = vector.shape_cast %174 : vector<1x1x4x1xf16> to vector<4xf16> | |
%176 = amdgpu.mfma %140 * %175 + %173 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%177 = vector.extract %115[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%178 = vector.shape_cast %177 : vector<1x1x4x1xf16> to vector<4xf16> | |
%179 = amdgpu.mfma %145 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%180 = vector.extract %115[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%181 = vector.shape_cast %180 : vector<1x1x4x1xf16> to vector<4xf16> | |
%182 = amdgpu.mfma %150 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%183 = vector.extract %115[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%184 = vector.shape_cast %183 : vector<1x1x4x1xf16> to vector<4xf16> | |
%185 = amdgpu.mfma %155 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x4x1xf32> | |
%187 = vector.insert %186, %159 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%188 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%189 = vector.extract %81[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%190 = vector.shape_cast %189 : vector<1x1x1x4xf16> to vector<4xf16> | |
%191 = vector.shape_cast %188 : vector<1x1x4x1xf32> to vector<4xf32> | |
%192 = amdgpu.mfma %190 * %120 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%193 = vector.extract %81[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%194 = vector.shape_cast %193 : vector<1x1x1x4xf16> to vector<4xf16> | |
%195 = amdgpu.mfma %194 * %126 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%196 = vector.extract %81[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%197 = vector.shape_cast %196 : vector<1x1x1x4xf16> to vector<4xf16> | |
%198 = amdgpu.mfma %197 * %131 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%199 = vector.extract %81[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%200 = vector.shape_cast %199 : vector<1x1x1x4xf16> to vector<4xf16> | |
%201 = amdgpu.mfma %200 * %136 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%202 = vector.extract %81[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%203 = vector.shape_cast %202 : vector<1x1x1x4xf16> to vector<4xf16> | |
%204 = amdgpu.mfma %203 * %141 + %201 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%205 = vector.extract %81[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%206 = vector.shape_cast %205 : vector<1x1x1x4xf16> to vector<4xf16> | |
%207 = amdgpu.mfma %206 * %146 + %204 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%208 = vector.extract %81[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%209 = vector.shape_cast %208 : vector<1x1x1x4xf16> to vector<4xf16> | |
%210 = amdgpu.mfma %209 * %151 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%211 = vector.extract %81[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%212 = vector.shape_cast %211 : vector<1x1x1x4xf16> to vector<4xf16> | |
%213 = amdgpu.mfma %212 * %156 + %210 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%214 = vector.shape_cast %213 : vector<4xf32> to vector<1x1x4x1xf32> | |
%215 = vector.insert %214, %187 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%216 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%217 = vector.shape_cast %216 : vector<1x1x4x1xf32> to vector<4xf32> | |
%218 = amdgpu.mfma %190 * %162 + %217 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%219 = amdgpu.mfma %194 * %166 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%220 = amdgpu.mfma %197 * %169 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%221 = amdgpu.mfma %200 * %172 + %220 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%222 = amdgpu.mfma %203 * %175 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%223 = amdgpu.mfma %206 * %178 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%224 = amdgpu.mfma %209 * %181 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%225 = amdgpu.mfma %212 * %184 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x4x1xf32> | |
%227 = vector.insert %226, %215 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
scf.yield %227 : vector<2x2x1x1x4x1xf32> | |
} | |
%4 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%5 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16)>()[%arg0, %thread_id_x] | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16)>()[%arg1, %thread_id_x] | |
vector.transfer_write %4, %2[%5, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
%7 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%8 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16 + 16)>()[%arg1, %thread_id_x] | |
vector.transfer_write %7, %2[%5, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
%9 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%10 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16 + 16)>()[%arg0, %thread_id_x] | |
vector.transfer_write %9, %2[%10, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %11, %2[%10, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
memref.dealloc %subview_4 : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
memref.dealloc %subview : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
return | |
} | |
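The loop body dumped above computes one 64x64 output tile by chaining amdgpu.mfma (16x16x16, f16 inputs, f32 accumulator) intrinsics: per K step it consumes a 64x128 slice of A and a 128x64 slice of B, split across a 2x2 grid of subgroup subtiles with 8 chained MFMAs each. Below is a minimal NumPy sketch of that decomposition, reference semantics only; tile sizes are taken from the lowering_config in the dump, and the function names are illustrative, not part of any API.

import numpy as np

# Tile sizes from the lowering_config (reference only):
#   workgroup tile : 64 x 64, K step 128 (workgroup = [64, 64, 0], reduction = [0, 0, 128])
#   MFMA intrinsic : 16x16x16, f16 inputs, f32 accumulator (MFMA_F32_16x16x16_F16)
#   subgroup split : 2 x 2 subgroups, each owning a 32 x 32 subtile (a 2 x 2 grid of MFMA tiles)
MMA = 16
K_STEPS = 128 // MMA  # 8 chained mfma ops per accumulator, matching the IR

def mfma_16x16x16(a16, b16, acc):
    # Stand-in for one amdgpu.mfma: acc += A @ B on a 16x16x16 fragment, accumulating in f32.
    return acc + a16.astype(np.float32) @ b16.astype(np.float32)

def subgroup_32x32(A_slab, B_slab):
    # One subgroup's 32x32 accumulator: 2x2 MFMA tiles, each chained over 8 K steps.
    # A_slab: 32x128 f16 rows owned by this subgroup; B_slab: 128x32 f16 columns.
    acc = np.zeros((2, 2, MMA, MMA), dtype=np.float32)
    for i in range(2):                # M subtile (first dim of the vector<2x2x...> accumulator)
        for j in range(2):            # N subtile (second dim)
            for k in range(K_STEPS):  # 8 chained amdgpu.mfma per (i, j)
                a = A_slab[i * MMA:(i + 1) * MMA, k * MMA:(k + 1) * MMA]
                b = B_slab[k * MMA:(k + 1) * MMA, j * MMA:(j + 1) * MMA]
                acc[i, j] = mfma_16x16x16(a, b, acc[i, j])
    return acc.transpose(0, 2, 1, 3).reshape(32, 32)

# Sanity check against a plain matmul:
rng = np.random.default_rng(0)
A = rng.standard_normal((32, 128)).astype(np.float16)
B = rng.standard_normal((128, 32)).astype(np.float16)
assert np.allclose(subgroup_32x32(A, B),
                   A.astype(np.float32) @ B.astype(np.float32), atol=1e-3)
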
// -----// IR Dump After LLVMGPULowerExecutableTargetPass (iree-llvmgpu-lower-executable-target) //----- // | |
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<8x2x1x1x4x1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<2x8x1x1x1x4xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<2x2x1x1x4x1xf32> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
%c0 = arith.constant 0 : index | |
%thread_id_x = gpu.thread_id x | |
%alloc = memref.alloc() : memref<128x68xf16, #gpu.address_space<workgroup>> | |
%subview = memref.subview %alloc[0, 0] [128, 64] [1, 1] : memref<128x68xf16, #gpu.address_space<workgroup>> to memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
%alloc_3 = memref.alloc() : memref<64x132xf16, #gpu.address_space<workgroup>> | |
%subview_4 = memref.subview %alloc_3[0, 0] [64, 128] [1, 1] : memref<64x132xf16, #gpu.address_space<workgroup>> to memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
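    // The two workgroup (LDS) buffers stage the operand tiles for the current K step:
    // %alloc_3 holds the 64x128 A slice and %alloc the 128x64 B slice. Rows are padded by
    // 4 f16 elements (132 and 68 columns), which appears intended to avoid LDS bank conflicts.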
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x256xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (256, 256) step (64, 64) { | |
%3 = scf.for %arg2 = %c0 to %c256 step %c128 iter_args(%arg3 = %cst_1) -> (vector<2x2x1x1x4x1xf32>) { | |
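      // K loop: two iterations of 128 over the 256-deep reduction; %arg3 carries each
      // thread's 2x2 grid of 4x1 f32 accumulator fragments (vector<2x2x1x1x4x1xf32>).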
gpu.barrier | |
%12 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16)>()[%arg0, %thread_id_x] | |
%13 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg2, %thread_id_x] | |
%14 = vector.transfer_read %0[%12, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 16)>()[%arg0, %thread_id_x] | |
%16 = vector.transfer_read %0[%15, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 32)>()[%arg0, %thread_id_x] | |
%18 = vector.transfer_read %0[%17, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 16) * 16 + 48)>()[%arg0, %thread_id_x] | |
%20 = vector.transfer_read %0[%19, %13], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32)>()[%arg2, %thread_id_x] | |
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 8) * 64)>()[%arg1, %thread_id_x] | |
%23 = vector.transfer_read %1[%21, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 32)>()[%arg2, %thread_id_x] | |
%25 = vector.transfer_read %1[%24, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 64)>()[%arg2, %thread_id_x] | |
%27 = vector.transfer_read %1[%26, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 8 - ((s1 floordiv 8) floordiv 32) * 32 + 96)>()[%arg2, %thread_id_x] | |
%29 = vector.transfer_read %1[%28, %22], %cst_2 {in_bounds = [true, true]} : memref<256x256xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
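      // Global loads for this K step: each thread reads four 1x8 f16 slices of the A tile
      // (%14, %16, %18, %20) and four of the B tile (%23, %25, %27, %29).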
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16)>()[%thread_id_x] | |
%31 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 16) * 128)>()[%thread_id_x] | |
vector.transfer_write %14, %alloc_3[%30, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%32 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 16)>()[%thread_id_x] | |
vector.transfer_write %16, %alloc_3[%32, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%33 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 32)>()[%thread_id_x] | |
vector.transfer_write %18, %alloc_3[%33, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 16 + 48)>()[%thread_id_x] | |
vector.transfer_write %20, %alloc_3[%34, %31] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x132xf16, #gpu.address_space<workgroup>> | |
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32)>()[%thread_id_x] | |
%36 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] | |
vector.transfer_write %23, %alloc[%35, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
%37 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 32)>()[%thread_id_x] | |
vector.transfer_write %25, %alloc[%37, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 64)>()[%thread_id_x] | |
vector.transfer_write %27, %alloc[%38, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
%39 = affine.apply affine_map<()[s0] -> ((s0 floordiv 8) mod 32 + 96)>()[%thread_id_x] | |
vector.transfer_write %29, %alloc[%39, %36] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x68xf16, #gpu.address_space<workgroup>> | |
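      // Stage the freshly loaded slices into the padded LDS buffers; the barrier below
      // makes them visible to the whole workgroup before the MFMA phase.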
gpu.barrier | |
%40 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%41 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>()[%thread_id_x] | |
%42 = vector.transfer_read %alloc_3[%40, %41], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%43 = vector.insert_strided_slice %42, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%44 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 16)>()[%thread_id_x] | |
%45 = vector.transfer_read %alloc_3[%40, %44], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%46 = vector.insert_strided_slice %45, %43 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%47 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 32)>()[%thread_id_x] | |
%48 = vector.transfer_read %alloc_3[%40, %47], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%49 = vector.insert_strided_slice %48, %46 {offsets = [0, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%50 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 48)>()[%thread_id_x] | |
%51 = vector.transfer_read %alloc_3[%40, %50], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%52 = vector.insert_strided_slice %51, %49 {offsets = [0, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%53 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 64)>()[%thread_id_x] | |
%54 = vector.transfer_read %alloc_3[%40, %53], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%55 = vector.insert_strided_slice %54, %52 {offsets = [0, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%56 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 80)>()[%thread_id_x] | |
%57 = vector.transfer_read %alloc_3[%40, %56], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%58 = vector.insert_strided_slice %57, %55 {offsets = [0, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%59 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 96)>()[%thread_id_x] | |
%60 = vector.transfer_read %alloc_3[%40, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%61 = vector.insert_strided_slice %60, %58 {offsets = [0, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%62 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16 + 112)>()[%thread_id_x] | |
%63 = vector.transfer_read %alloc_3[%40, %62], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%64 = vector.insert_strided_slice %63, %61 {offsets = [0, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%65 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%66 = vector.transfer_read %alloc_3[%65, %41], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%67 = vector.insert_strided_slice %66, %64 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%68 = vector.transfer_read %alloc_3[%65, %44], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%69 = vector.insert_strided_slice %68, %67 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%70 = vector.transfer_read %alloc_3[%65, %47], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%71 = vector.insert_strided_slice %70, %69 {offsets = [1, 2, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%72 = vector.transfer_read %alloc_3[%65, %50], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%73 = vector.insert_strided_slice %72, %71 {offsets = [1, 3, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%74 = vector.transfer_read %alloc_3[%65, %53], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%75 = vector.insert_strided_slice %74, %73 {offsets = [1, 4, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%76 = vector.transfer_read %alloc_3[%65, %56], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%77 = vector.insert_strided_slice %76, %75 {offsets = [1, 5, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%78 = vector.transfer_read %alloc_3[%65, %59], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%79 = vector.insert_strided_slice %78, %77 {offsets = [1, 6, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
%80 = vector.transfer_read %alloc_3[%65, %62], %cst_2 {in_bounds = [true, true]} : memref<64x132xf16, #gpu.address_space<workgroup>>, vector<1x4xf16> | |
%81 = vector.insert_strided_slice %80, %79 {offsets = [1, 7, 0, 0, 0, 0], strides = [1, 1]} : vector<1x4xf16> into vector<2x8x1x1x1x4xf16> | |
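      // %81 gathers this thread's A operand fragments from LDS: 2 M subtiles x 8 K steps,
      // each a 1x4 f16 fragment.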
%82 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16)>()[%thread_id_x] | |
%83 = vector.transfer_read %alloc[%41, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%84 = vector.insert_strided_slice %83, %cst {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%85 = affine.apply affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 16) * 16 + 16)>()[%thread_id_x] | |
%86 = vector.transfer_read %alloc[%41, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%87 = vector.insert_strided_slice %86, %84 {offsets = [0, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%88 = vector.transfer_read %alloc[%44, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%89 = vector.insert_strided_slice %88, %87 {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%90 = vector.transfer_read %alloc[%44, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%91 = vector.insert_strided_slice %90, %89 {offsets = [1, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%92 = vector.transfer_read %alloc[%47, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%93 = vector.insert_strided_slice %92, %91 {offsets = [2, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%94 = vector.transfer_read %alloc[%47, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%95 = vector.insert_strided_slice %94, %93 {offsets = [2, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%96 = vector.transfer_read %alloc[%50, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%97 = vector.insert_strided_slice %96, %95 {offsets = [3, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%98 = vector.transfer_read %alloc[%50, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%99 = vector.insert_strided_slice %98, %97 {offsets = [3, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%100 = vector.transfer_read %alloc[%53, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%101 = vector.insert_strided_slice %100, %99 {offsets = [4, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%102 = vector.transfer_read %alloc[%53, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%103 = vector.insert_strided_slice %102, %101 {offsets = [4, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%104 = vector.transfer_read %alloc[%56, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%105 = vector.insert_strided_slice %104, %103 {offsets = [5, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%106 = vector.transfer_read %alloc[%56, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%107 = vector.insert_strided_slice %106, %105 {offsets = [5, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%108 = vector.transfer_read %alloc[%59, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%109 = vector.insert_strided_slice %108, %107 {offsets = [6, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%110 = vector.transfer_read %alloc[%59, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%111 = vector.insert_strided_slice %110, %109 {offsets = [6, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%112 = vector.transfer_read %alloc[%62, %82], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%113 = vector.insert_strided_slice %112, %111 {offsets = [7, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
%114 = vector.transfer_read %alloc[%62, %85], %cst_2 {in_bounds = [true, true]} : memref<128x68xf16, #gpu.address_space<workgroup>>, vector<4x1xf16> | |
%115 = vector.insert_strided_slice %114, %113 {offsets = [7, 1, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<8x2x1x1x4x1xf16> | |
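      // %115 gathers the B operand fragments: 8 K steps x 2 N subtiles, each a 4x1 f16 fragment.
      // The 32 amdgpu.mfma ops below accumulate the 2x2 output subtiles, chaining 8 MFMAs over K for each.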
%116 = vector.extract %arg3[0, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%117 = vector.extract %81[0, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%118 = vector.extract %115[0, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%119 = vector.shape_cast %117 : vector<1x1x1x4xf16> to vector<4xf16> | |
%120 = vector.shape_cast %118 : vector<1x1x4x1xf16> to vector<4xf16> | |
%121 = vector.shape_cast %116 : vector<1x1x4x1xf32> to vector<4xf32> | |
%122 = amdgpu.mfma %119 * %120 + %121 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%123 = vector.extract %81[0, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%124 = vector.extract %115[1, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%125 = vector.shape_cast %123 : vector<1x1x1x4xf16> to vector<4xf16> | |
%126 = vector.shape_cast %124 : vector<1x1x4x1xf16> to vector<4xf16> | |
%127 = amdgpu.mfma %125 * %126 + %122 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%128 = vector.extract %81[0, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%129 = vector.extract %115[2, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%130 = vector.shape_cast %128 : vector<1x1x1x4xf16> to vector<4xf16> | |
%131 = vector.shape_cast %129 : vector<1x1x4x1xf16> to vector<4xf16> | |
%132 = amdgpu.mfma %130 * %131 + %127 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%133 = vector.extract %81[0, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%134 = vector.extract %115[3, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%135 = vector.shape_cast %133 : vector<1x1x1x4xf16> to vector<4xf16> | |
%136 = vector.shape_cast %134 : vector<1x1x4x1xf16> to vector<4xf16> | |
%137 = amdgpu.mfma %135 * %136 + %132 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%138 = vector.extract %81[0, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%139 = vector.extract %115[4, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%140 = vector.shape_cast %138 : vector<1x1x1x4xf16> to vector<4xf16> | |
%141 = vector.shape_cast %139 : vector<1x1x4x1xf16> to vector<4xf16> | |
%142 = amdgpu.mfma %140 * %141 + %137 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%143 = vector.extract %81[0, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%144 = vector.extract %115[5, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%145 = vector.shape_cast %143 : vector<1x1x1x4xf16> to vector<4xf16> | |
%146 = vector.shape_cast %144 : vector<1x1x4x1xf16> to vector<4xf16> | |
%147 = amdgpu.mfma %145 * %146 + %142 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%148 = vector.extract %81[0, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%149 = vector.extract %115[6, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%150 = vector.shape_cast %148 : vector<1x1x1x4xf16> to vector<4xf16> | |
%151 = vector.shape_cast %149 : vector<1x1x4x1xf16> to vector<4xf16> | |
%152 = amdgpu.mfma %150 * %151 + %147 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%153 = vector.extract %81[0, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%154 = vector.extract %115[7, 0] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%155 = vector.shape_cast %153 : vector<1x1x1x4xf16> to vector<4xf16> | |
%156 = vector.shape_cast %154 : vector<1x1x4x1xf16> to vector<4xf16> | |
%157 = amdgpu.mfma %155 * %156 + %152 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%158 = vector.shape_cast %157 : vector<4xf32> to vector<1x1x4x1xf32> | |
%159 = vector.insert %158, %cst_1 [0, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%160 = vector.extract %arg3[0, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%161 = vector.extract %115[0, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%162 = vector.shape_cast %161 : vector<1x1x4x1xf16> to vector<4xf16> | |
%163 = vector.shape_cast %160 : vector<1x1x4x1xf32> to vector<4xf32> | |
%164 = amdgpu.mfma %119 * %162 + %163 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%165 = vector.extract %115[1, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%166 = vector.shape_cast %165 : vector<1x1x4x1xf16> to vector<4xf16> | |
%167 = amdgpu.mfma %125 * %166 + %164 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%168 = vector.extract %115[2, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%169 = vector.shape_cast %168 : vector<1x1x4x1xf16> to vector<4xf16> | |
%170 = amdgpu.mfma %130 * %169 + %167 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%171 = vector.extract %115[3, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%172 = vector.shape_cast %171 : vector<1x1x4x1xf16> to vector<4xf16> | |
%173 = amdgpu.mfma %135 * %172 + %170 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%174 = vector.extract %115[4, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%175 = vector.shape_cast %174 : vector<1x1x4x1xf16> to vector<4xf16> | |
%176 = amdgpu.mfma %140 * %175 + %173 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%177 = vector.extract %115[5, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%178 = vector.shape_cast %177 : vector<1x1x4x1xf16> to vector<4xf16> | |
%179 = amdgpu.mfma %145 * %178 + %176 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%180 = vector.extract %115[6, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%181 = vector.shape_cast %180 : vector<1x1x4x1xf16> to vector<4xf16> | |
%182 = amdgpu.mfma %150 * %181 + %179 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%183 = vector.extract %115[7, 1] : vector<1x1x4x1xf16> from vector<8x2x1x1x4x1xf16> | |
%184 = vector.shape_cast %183 : vector<1x1x4x1xf16> to vector<4xf16> | |
%185 = amdgpu.mfma %155 * %184 + %182 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%186 = vector.shape_cast %185 : vector<4xf32> to vector<1x1x4x1xf32> | |
%187 = vector.insert %186, %159 [0, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%188 = vector.extract %arg3[1, 0] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%189 = vector.extract %81[1, 0] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%190 = vector.shape_cast %189 : vector<1x1x1x4xf16> to vector<4xf16> | |
%191 = vector.shape_cast %188 : vector<1x1x4x1xf32> to vector<4xf32> | |
%192 = amdgpu.mfma %190 * %120 + %191 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%193 = vector.extract %81[1, 1] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%194 = vector.shape_cast %193 : vector<1x1x1x4xf16> to vector<4xf16> | |
%195 = amdgpu.mfma %194 * %126 + %192 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%196 = vector.extract %81[1, 2] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%197 = vector.shape_cast %196 : vector<1x1x1x4xf16> to vector<4xf16> | |
%198 = amdgpu.mfma %197 * %131 + %195 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%199 = vector.extract %81[1, 3] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%200 = vector.shape_cast %199 : vector<1x1x1x4xf16> to vector<4xf16> | |
%201 = amdgpu.mfma %200 * %136 + %198 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%202 = vector.extract %81[1, 4] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%203 = vector.shape_cast %202 : vector<1x1x1x4xf16> to vector<4xf16> | |
%204 = amdgpu.mfma %203 * %141 + %201 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%205 = vector.extract %81[1, 5] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%206 = vector.shape_cast %205 : vector<1x1x1x4xf16> to vector<4xf16> | |
%207 = amdgpu.mfma %206 * %146 + %204 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%208 = vector.extract %81[1, 6] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%209 = vector.shape_cast %208 : vector<1x1x1x4xf16> to vector<4xf16> | |
%210 = amdgpu.mfma %209 * %151 + %207 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%211 = vector.extract %81[1, 7] : vector<1x1x1x4xf16> from vector<2x8x1x1x1x4xf16> | |
%212 = vector.shape_cast %211 : vector<1x1x1x4xf16> to vector<4xf16> | |
%213 = amdgpu.mfma %212 * %156 + %210 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%214 = vector.shape_cast %213 : vector<4xf32> to vector<1x1x4x1xf32> | |
%215 = vector.insert %214, %187 [1, 0] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
%216 = vector.extract %arg3[1, 1] : vector<1x1x4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%217 = vector.shape_cast %216 : vector<1x1x4x1xf32> to vector<4xf32> | |
%218 = amdgpu.mfma %190 * %162 + %217 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%219 = amdgpu.mfma %194 * %166 + %218 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%220 = amdgpu.mfma %197 * %169 + %219 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%221 = amdgpu.mfma %200 * %172 + %220 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%222 = amdgpu.mfma %203 * %175 + %221 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%223 = amdgpu.mfma %206 * %178 + %222 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%224 = amdgpu.mfma %209 * %181 + %223 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%225 = amdgpu.mfma %212 * %184 + %224 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> | |
%226 = vector.shape_cast %225 : vector<4xf32> to vector<1x1x4x1xf32> | |
%227 = vector.insert %226, %215 [1, 1] : vector<1x1x4x1xf32> into vector<2x2x1x1x4x1xf32> | |
scf.yield %227 : vector<2x2x1x1x4x1xf32> | |
} | |
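    // Epilogue: extract the four 4x1 f32 accumulator fragments and write them to the
    // output buffer at thread-mapped offsets.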
%4 = vector.extract %3[0, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%5 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16)>()[%arg0, %thread_id_x] | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16)>()[%arg1, %thread_id_x] | |
vector.transfer_write %4, %2[%5, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
%7 = vector.extract %3[0, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%8 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + (s1 floordiv 64) * 32 - ((s1 floordiv 64) floordiv 2) * 64 - (s1 floordiv 16) * 16 + 16)>()[%arg1, %thread_id_x] | |
vector.transfer_write %7, %2[%5, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
%9 = vector.extract %3[1, 0, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
%10 = affine.apply affine_map<()[s0, s1] -> (s0 + (s1 floordiv 128) * 32 - ((s1 floordiv 128) floordiv 2) * 64 + (s1 floordiv 16) * 4 - ((s1 floordiv 16) floordiv 4) * 16 + 16)>()[%arg0, %thread_id_x] | |
vector.transfer_write %9, %2[%10, %6] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
%11 = vector.extract %3[1, 1, 0, 0] : vector<4x1xf32> from vector<2x2x1x1x4x1xf32> | |
vector.transfer_write %11, %2[%10, %8] {in_bounds = [true, true]} : vector<4x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]} | |
memref.dealloc %subview_4 : memref<64x128xf16, strided<[132, 1]>, #gpu.address_space<workgroup>> | |
memref.dealloc %subview : memref<128x64xf16, strided<[68, 1]>, #gpu.address_space<workgroup>> | |
return | |
} | |
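Both dumps allocate the staging buffers with padded rows: the 64x128 A tile lives in a 64x132 LDS buffer and the 128x64 B tile in a 128x68 one (4 extra f16 columns per row). Given no_reduce_shared_memory_bank_conflicts = false in the pipeline options, this padding is presumably there to spread accesses across LDS banks. The short sketch below illustrates the arithmetic, assuming a hypothetical bank model of 32 four-byte banks (common on AMD GPUs); the access pattern and numbers are illustrative, not taken from the dump.

# Hypothetical LDS bank model: 32 banks, 4 bytes wide; f16 elements are 2 bytes.
BANKS, BANK_BYTES, ELEM_BYTES = 32, 4, 2

def bank(row, col, row_stride_elems):
    # Bank hit by element (row, col) of a row-major f16 buffer with the given row stride.
    byte_addr = (row * row_stride_elems + col) * ELEM_BYTES
    return (byte_addr // BANK_BYTES) % BANKS

# 16 lanes reading the same column from 16 consecutive rows (a column of a 16x16 fragment):
col = 0
unpadded = {bank(r, col, 64) for r in range(16)}  # stride 64 f16 = 128 B = exactly 32 banks per row
padded   = {bank(r, col, 68) for r in range(16)}  # stride 68 f16 = 136 B -> shifts 2 banks per row

print(len(unpadded), "distinct banks with stride 64")  # 1  -> every row maps to the same bank
print(len(padded), "distinct banks with stride 68")    # 16 -> the conflicts are spread out
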