@pashu123
Created January 13, 2025 08:04
This file has been truncated; only part of the full pass-by-pass IR dump is reproduced below.
// -----// IR Dump After ConvolutionToIGEMMPass (iree-codegen-convolution-to-igemm) //----- //
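// The 3x3, stride-2 NHWC convolution (input 2x35x35x1281) is rewritten as an implicit GEMM:
// iree_linalg_ext.im2col gathers input patches into a 2x17x17x11529 tensor
// (OH = OW = floor((35 - 3) / 2) + 1 = 17, K = 3 * 3 * 1281 = 11529), which a linalg.generic
// then contracts against the 11529x1281 filter with f32 accumulation.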
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x17x17x1281xf32>) -> tensor<2x17x17x1281xf32>
%6 = tensor.empty() : tensor<2x17x17x11529xf16>
%7 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [0, 0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%6 : tensor<2x17x17x11529xf16>) -> tensor<2x17x17x11529xf16>
%8 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%7, %8 : tensor<2x17x17x11529xf16>, tensor<11529x1281xf16>) outs(%5 : tensor<2x17x17x1281xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} {
^bb0(%in: f16, %in_0: f16, %out: f32):
%10 = arith.extf %in : f16 to f32
%11 = arith.extf %in_0 : f16 to f32
%12 = arith.mulf %10, %11 : f32
%13 = arith.addf %12, %out : f32
linalg.yield %13 : f32
} -> tensor<2x17x17x1281xf32>
flow.dispatch.tensor.store %9, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After ConvertAccGEMMToGEMMPass (iree-convert-accgemm-to-gemm) //----- //
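// No visible change in this dump: the GEMM already accumulates into a zero-filled
// tensor.empty rather than a loaded accumulator, so there appears to be nothing for this
// pass to rewrite in this dispatch.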
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x17x17x1281xf32>) -> tensor<2x17x17x1281xf32>
%6 = tensor.empty() : tensor<2x17x17x11529xf16>
%7 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [0, 0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%6 : tensor<2x17x17x11529xf16>) -> tensor<2x17x17x11529xf16>
%8 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%7, %8 : tensor<2x17x17x11529xf16>, tensor<11529x1281xf16>) outs(%5 : tensor<2x17x17x1281xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} {
^bb0(%in: f16, %in_0: f16, %out: f32):
%10 = arith.extf %in : f16 to f32
%11 = arith.extf %in_0 : f16 to f32
%12 = arith.mulf %10, %11 : f32
%13 = arith.addf %12, %out : f32
linalg.yield %13 : f32
} -> tensor<2x17x17x1281xf32>
flow.dispatch.tensor.store %9, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
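// The GEMM is distributed to workgroups with an scf.forall over (OH, OC) = (17, 1281) in
// steps of (1, 16), following the workgroup tile sizes [2, 1, 32, 16, 0]; batch (2) and
// OW (17 <= 32) are not split across workgroups. Because 1281 = 80 * 16 + 1, the
// affine.min yields a tail tile of width 1 on the last OC slice.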
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c17 = arith.constant 17 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%extracted_slice_0 = tensor.extract_slice %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%10 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%9, %extracted_slice : tensor<2x1x17x11529xf16>, tensor<11529x?xf16>) outs(%10 : tensor<2x1x17x?xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} {
^bb0(%in: f16, %in_1: f16, %out: f32):
%12 = arith.extf %in : f16 to f32
%13 = arith.extf %in_1 : f16 to f32
%14 = arith.mulf %12, %13 : f32
%15 = arith.addf %14, %out : f32
linalg.yield %15 : f32
} -> tensor<2x1x17x?xf32>
%cast = tensor.cast %11 : tensor<2x1x17x?xf32> to tensor<2x1x?x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %cast into %arg2[%c0, %arg0, %c0, %arg1] [2, 1, %c17, %7] [1, 1, 1, 1] : tensor<2x1x?x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
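// Canonicalization drops the %c17 constant and the tensor.cast: the
// tensor.parallel_insert_slice now uses static offsets and the static size 17 directly.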
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%extracted_slice_0 = tensor.extract_slice %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%10 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%9, %extracted_slice : tensor<2x1x17x11529xf16>, tensor<11529x?xf16>) outs(%10 : tensor<2x1x17x?xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} {
^bb0(%in: f16, %in_1: f16, %out: f32):
%12 = arith.extf %in : f16 to f32
%13 = arith.extf %in_1 : f16 to f32
%14 = arith.mulf %12, %13 : f32
%15 = arith.addf %14, %out : f32
linalg.yield %15 : f32
} -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %11 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
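// CSE leaves this dump identical to the previous one.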
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%extracted_slice_0 = tensor.extract_slice %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%10 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%9, %extracted_slice : tensor<2x1x17x11529xf16>, tensor<11529x?xf16>) outs(%10 : tensor<2x1x17x?xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} {
^bb0(%in: f16, %in_1: f16, %out: f32):
%12 = arith.extf %in : f16 to f32
%13 = arith.extf %in_1 : f16 to f32
%14 = arith.mulf %12, %13 : f32
%15 = arith.addf %14, %out : f32
linalg.yield %15 : f32
} -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %11 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After GPUPadOperandsPass (iree-codegen-gpu-pad-operands) //----- //
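// Operands are padded to the shapes requested by padding = [2, 1, 32, 16, 16]: the im2col
// result grows from 2x1x17x11529 to 2x1x32x11536 (17 -> 32 on OW, 11529 -> 11536 = 721 * 16
// on K), the filter slice to 11536x16, and the f32 accumulator slice to 2x1x32x16, all with
// zero padding values.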
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%extracted_slice_0 = tensor.extract_slice %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%10 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
%cst_1 = arith.constant 0.000000e+00 : f16
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst_1 : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%cst_2 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%11 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_3 = tensor.pad %extracted_slice low[0, 0] high[7, %11] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst_2 : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%cst_4 = arith.constant 0.000000e+00 : f32
%c3 = arith.constant 3 : index
%dim = tensor.dim %10, %c3 : tensor<2x1x17x?xf32>
%12 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%dim)
%padded_5 = tensor.pad %10 low[0, 0, 0, 0] high[0, 0, 15, %12] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst_4 : f32
} : tensor<2x1x17x?xf32> to tensor<2x1x32x16xf32>
%c1_6 = arith.constant 1 : index
%c3_7 = arith.constant 3 : index
%dim_8 = tensor.dim %10, %c3_7 : tensor<2x1x17x?xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%padded, %padded_3 : tensor<2x1x32x11536xf16>, tensor<11536x16xf16>) outs(%padded_5 : tensor<2x1x32x16xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} {
^bb0(%in: f16, %in_10: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
%15 = arith.extf %in_10 : f16 to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %16, %out : f32
linalg.yield %17 : f32
} -> tensor<2x1x32x16xf32>
%extracted_slice_9 = tensor.extract_slice %13[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %extracted_slice_9 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After GPUPromoteMatmulOperandsPass (iree-codegen-gpu-promote-matmul-operands) //----- //
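// Per promote_operands = [0, 1, 2], both padded GEMM inputs are routed through linalg.copy
// ops tagged #iree_gpu.derived_thread_config, and the result goes through a
// bufferization.alloc_tensor in #gpu.address_space<workgroup>; these copies presumably
// become the shared-memory staging buffers after bufferization.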
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%extracted_slice_0 = tensor.extract_slice %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%10 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
%cst_1 = arith.constant 0.000000e+00 : f16
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst_1 : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%cst_2 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%11 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_3 = tensor.pad %extracted_slice low[0, 0] high[7, %11] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst_2 : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%cst_4 = arith.constant 0.000000e+00 : f32
%c3 = arith.constant 3 : index
%dim = tensor.dim %10, %c3 : tensor<2x1x17x?xf32>
%12 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%dim)
%padded_5 = tensor.pad %10 low[0, 0, 0, 0] high[0, 0, 15, %12] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst_4 : f32
} : tensor<2x1x17x?xf32> to tensor<2x1x32x16xf32>
%c1_6 = arith.constant 1 : index
%c3_7 = arith.constant 3 : index
%dim_8 = tensor.dim %10, %c3_7 : tensor<2x1x17x?xf32>
%13 = tensor.empty() : tensor<2x1x32x11536xf16>
%14 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded : tensor<2x1x32x11536xf16>) outs(%13 : tensor<2x1x32x11536xf16>) -> tensor<2x1x32x11536xf16>
%15 = tensor.empty() : tensor<11536x16xf16>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded_3 : tensor<11536x16xf16>) outs(%15 : tensor<11536x16xf16>) -> tensor<11536x16xf16>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%14, %16 : tensor<2x1x32x11536xf16>, tensor<11536x16xf16>) outs(%padded_5 : tensor<2x1x32x16xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} {
^bb0(%in: f16, %in_11: f16, %out: f32):
%22 = arith.extf %in : f16 to f32
%23 = arith.extf %in_11 : f16 to f32
%24 = arith.mulf %22, %23 : f32
%25 = arith.addf %24, %out : f32
linalg.yield %25 : f32
} -> tensor<2x1x32x16xf32>
%18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%19 = linalg.copy ins(%17 : tensor<2x1x32x16xf32>) outs(%18 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_9 = tensor.extract_slice %19[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%c3_10 = arith.constant 3 : index
%20 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_9 : tensor<2x1x17x?xf32>) outs(%20 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %21 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After GPUPackToIntrinsicsPass (iree-codegen-gpu-pack-to-intrinsics) //----- //
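// The GEMM is packed to the MFMA_F32_16x16x16_F16 intrinsic shape using 16x16 tiles:
// a 2x1x2x721x16x16 LHS (32 / 16 = 2, 11536 / 16 = 721), a 721x1x16x16 RHS, and a
// 2x1x2x1x16x16 f32 accumulator feed iree_gpu.multi_mma with rhs_permutation = [1, 0].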
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%extracted_slice_1 = tensor.extract_slice %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%extracted_slice_1 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%11 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_2 = tensor.pad %extracted_slice low[0, 0] high[7, %11] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%dim = tensor.dim %10, %c3 : tensor<2x1x17x?xf32>
%12 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%dim)
%padded_3 = tensor.pad %10 low[0, 0, 0, 0] high[0, 0, 15, %12] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst_0 : f32
} : tensor<2x1x17x?xf32> to tensor<2x1x32x16xf32>
%13 = tensor.empty() : tensor<2x1x32x11536xf16>
%14 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded : tensor<2x1x32x11536xf16>) outs(%13 : tensor<2x1x32x11536xf16>) -> tensor<2x1x32x11536xf16>
%15 = tensor.empty() : tensor<11536x16xf16>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded_2 : tensor<11536x16xf16>) outs(%15 : tensor<11536x16xf16>) -> tensor<11536x16xf16>
%17 = tensor.empty() : tensor<2x1x2x721x16x16xf16>
%pack = tensor.pack %14 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %17 : tensor<2x1x32x11536xf16> -> tensor<2x1x2x721x16x16xf16>
%18 = tensor.empty() : tensor<721x1x16x16xf16>
%pack_4 = tensor.pack %16 inner_dims_pos = [1, 0] inner_tiles = [16, 16] into %18 : tensor<11536x16xf16> -> tensor<721x1x16x16xf16>
%19 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%pack_5 = tensor.pack %padded_3 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %19 : tensor<2x1x32x16xf32> -> tensor<2x1x2x1x16x16xf32>
%20 = iree_gpu.multi_mma %pack, %pack_4, %pack_5 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x721x16x16xf16>, tensor<721x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
%unpack = tensor.unpack %20 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %padded_3 : tensor<2x1x2x1x16x16xf32> -> tensor<2x1x32x16xf32>
%21 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%22 = linalg.copy ins(%unpack : tensor<2x1x32x16xf32>) outs(%21 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_6 = tensor.extract_slice %22[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%23 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%24 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_6 : tensor<2x1x17x?xf32>) outs(%23 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %24 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After DecomposeBoundaryPackUnPackOpsPass (iree-codegen-decompose-boundary-pack-unpack-ops) //----- //
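// The accumulator is no longer sliced out of the destination tensor: the fill/pad/pack
// chain on the output slice is replaced by zero fills created directly at the 2x1x32x16
// and packed 2x1x2x1x16x16 shapes, which appears to come from folding the pack of a
// padded zero fill.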
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x32x16xf32>
%12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%13 = tensor.empty() : tensor<2x1x32x11536xf16>
%14 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded : tensor<2x1x32x11536xf16>) outs(%13 : tensor<2x1x32x11536xf16>) -> tensor<2x1x32x11536xf16>
%15 = tensor.empty() : tensor<11536x16xf16>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded_1 : tensor<11536x16xf16>) outs(%15 : tensor<11536x16xf16>) -> tensor<11536x16xf16>
%17 = tensor.empty() : tensor<2x1x2x721x16x16xf16>
%pack = tensor.pack %14 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %17 : tensor<2x1x32x11536xf16> -> tensor<2x1x2x721x16x16xf16>
%18 = tensor.empty() : tensor<721x1x16x16xf16>
%pack_2 = tensor.pack %16 inner_dims_pos = [1, 0] inner_tiles = [16, 16] into %18 : tensor<11536x16xf16> -> tensor<721x1x16x16xf16>
%19 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%20 = linalg.fill ins(%cst_0 : f32) outs(%19 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%21 = iree_gpu.multi_mma %pack, %pack_2, %20 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x721x16x16xf16>, tensor<721x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
%unpack = tensor.unpack %21 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %12 : tensor<2x1x2x1x16x16xf32> -> tensor<2x1x32x16xf32>
%22 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%23 = linalg.copy ins(%unpack : tensor<2x1x32x16xf32>) outs(%22 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_3 = tensor.extract_slice %23[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%24 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%25 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x17x?xf32>) outs(%24 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After ConcretizeMmaShapesPass (iree-gpu-concretize-mma-shapes) //----- //
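// No visible change in this dump relative to the previous one.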
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x32x16xf32>
%12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%13 = tensor.empty() : tensor<2x1x32x11536xf16>
%14 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded : tensor<2x1x32x11536xf16>) outs(%13 : tensor<2x1x32x11536xf16>) -> tensor<2x1x32x11536xf16>
%15 = tensor.empty() : tensor<11536x16xf16>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded_1 : tensor<11536x16xf16>) outs(%15 : tensor<11536x16xf16>) -> tensor<11536x16xf16>
%17 = tensor.empty() : tensor<2x1x2x721x16x16xf16>
%pack = tensor.pack %14 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %17 : tensor<2x1x32x11536xf16> -> tensor<2x1x2x721x16x16xf16>
%18 = tensor.empty() : tensor<721x1x16x16xf16>
%pack_2 = tensor.pack %16 inner_dims_pos = [1, 0] inner_tiles = [16, 16] into %18 : tensor<11536x16xf16> -> tensor<721x1x16x16xf16>
%19 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%20 = linalg.fill ins(%cst_0 : f32) outs(%19 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%21 = iree_gpu.multi_mma %pack, %pack_2, %20 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x721x16x16xf16>, tensor<721x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
%unpack = tensor.unpack %21 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %12 : tensor<2x1x2x1x16x16xf32> -> tensor<2x1x32x16xf32>
%22 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%23 = linalg.copy ins(%unpack : tensor<2x1x32x16xf32>) outs(%22 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_3 = tensor.extract_slice %23[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%24 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%25 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x17x?xf32>) outs(%24 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After PropagateReshapesByExpansionPass (iree-codegen-propagate-reshapes-by-expansion) //----- //
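// No visible change in this dump either.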
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x32x16xf32>
%12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%13 = tensor.empty() : tensor<2x1x32x11536xf16>
%14 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded : tensor<2x1x32x11536xf16>) outs(%13 : tensor<2x1x32x11536xf16>) -> tensor<2x1x32x11536xf16>
%15 = tensor.empty() : tensor<11536x16xf16>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%padded_1 : tensor<11536x16xf16>) outs(%15 : tensor<11536x16xf16>) -> tensor<11536x16xf16>
%17 = tensor.empty() : tensor<2x1x2x721x16x16xf16>
%pack = tensor.pack %14 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %17 : tensor<2x1x32x11536xf16> -> tensor<2x1x2x721x16x16xf16>
%18 = tensor.empty() : tensor<721x1x16x16xf16>
%pack_2 = tensor.pack %16 inner_dims_pos = [1, 0] inner_tiles = [16, 16] into %18 : tensor<11536x16xf16> -> tensor<721x1x16x16xf16>
%19 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%20 = linalg.fill ins(%cst_0 : f32) outs(%19 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%21 = iree_gpu.multi_mma %pack, %pack_2, %20 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x721x16x16xf16>, tensor<721x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
%unpack = tensor.unpack %21 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %12 : tensor<2x1x2x1x16x16xf32> -> tensor<2x1x32x16xf32>
%22 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%23 = linalg.copy ins(%unpack : tensor<2x1x32x16xf32>) outs(%22 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_3 = tensor.extract_slice %23[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%24 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%25 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x17x?xf32>) outs(%24 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
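// End of the workgroup-tiled dump: each (y, x) workgroup produces a 2x1x17x(<=16) output tile.
// The im2col result is padded to 2x1x32x11536 and the weight slice to 11536x16, both are packed
// into the MFMA_F32_16x16x16_F16 layout (2x1x2x721x16x16 and 721x1x16x16), a single multi_mma
// performs the full reduction, and the result is unpacked, staged through a workgroup-memory
// tensor, and the valid 2x1x17x%7 region is written back into the 2x17x17x1281 output.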
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- //
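// Compared with the dump above, the reduction (K) dimension is now tiled: the single multi_mma
// over the padded K = 11536 (721 * 16) becomes an scf.for from 0 to 721 that, per iteration,
// slices a 2x1x32x16 LHS tile and a 16x16 RHS tile out of the padded operands, copies them
// (derived_thread_config), packs them to the per-step MMA shapes 2x1x2x1x16x16 / 1x1x16x16, and
// accumulates into the 2x1x2x1x16x16xf32 iter_args value.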
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x32x16xf32>
%12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%13 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%14 = linalg.fill ins(%cst_0 : f32) outs(%13 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%15 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %14) -> (tensor<2x1x2x1x16x16xf32>) {
%20 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, 0, %20] [2, 1, 32, 16] [1, 1, 1, 1] : tensor<2x1x32x11536xf16> to tensor<2x1x32x16xf16>
%21 = tensor.empty() : tensor<2x1x32x16xf16>
%22 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x32x16xf16>) outs(%21 : tensor<2x1x32x16xf16>) -> tensor<2x1x32x16xf16>
%23 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%pack = tensor.pack %22 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %23 : tensor<2x1x32x16xf16> -> tensor<2x1x2x1x16x16xf16>
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_4 = tensor.extract_slice %padded_1[%24, 0] [16, 16] [1, 1] : tensor<11536x16xf16> to tensor<16x16xf16>
%25 = tensor.empty() : tensor<16x16xf16>
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<16x16xf16>) outs(%25 : tensor<16x16xf16>) -> tensor<16x16xf16>
%27 = tensor.empty() : tensor<1x1x16x16xf16>
%pack_5 = tensor.pack %26 inner_dims_pos = [1, 0] inner_tiles = [16, 16] into %27 : tensor<16x16xf16> -> tensor<1x1x16x16xf16>
%28 = iree_gpu.multi_mma %pack, %pack_5, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %28 : tensor<2x1x2x1x16x16xf32>
}
%unpack = tensor.unpack %15 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %12 : tensor<2x1x2x1x16x16xf32> -> tensor<2x1x32x16xf32>
%16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%17 = linalg.copy ins(%unpack : tensor<2x1x32x16xf32>) outs(%16 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_2 = tensor.extract_slice %17[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%18 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%19 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<2x1x17x?xf32>) outs(%18 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
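// Canonicalization after reduction tiling: the linalg.fill of the 2x1x32x16xf32 unpack
// destination is folded away (tensor.unpack fully overwrites it, so a bare tensor.empty
// suffices); only the fill of the packed 2x1x2x1x16x16xf32 accumulator remains.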
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x32x16xf32>
%12 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%13 = linalg.fill ins(%cst_0 : f32) outs(%12 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%14 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %13) -> (tensor<2x1x2x1x16x16xf32>) {
%19 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, 0, %19] [2, 1, 32, 16] [1, 1, 1, 1] : tensor<2x1x32x11536xf16> to tensor<2x1x32x16xf16>
%20 = tensor.empty() : tensor<2x1x32x16xf16>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x32x16xf16>) outs(%20 : tensor<2x1x32x16xf16>) -> tensor<2x1x32x16xf16>
%22 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%pack = tensor.pack %21 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %22 : tensor<2x1x32x16xf16> -> tensor<2x1x2x1x16x16xf16>
%23 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_4 = tensor.extract_slice %padded_1[%23, 0] [16, 16] [1, 1] : tensor<11536x16xf16> to tensor<16x16xf16>
%24 = tensor.empty() : tensor<16x16xf16>
%25 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<16x16xf16>) outs(%24 : tensor<16x16xf16>) -> tensor<16x16xf16>
%26 = tensor.empty() : tensor<1x1x16x16xf16>
%pack_5 = tensor.pack %25 inner_dims_pos = [1, 0] inner_tiles = [16, 16] into %26 : tensor<16x16xf16> -> tensor<1x1x16x16xf16>
%27 = iree_gpu.multi_mma %pack, %pack_5, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %27 : tensor<2x1x2x1x16x16xf32>
}
%unpack = tensor.unpack %14 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %11 : tensor<2x1x2x1x16x16xf32> -> tensor<2x1x32x16xf32>
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%16 = linalg.copy ins(%unpack : tensor<2x1x32x16xf32>) outs(%15 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_2 = tensor.extract_slice %16[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%17 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%18 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<2x1x17x?xf32>) outs(%17 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
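// CSE merges the two identical affine.apply <(d0) -> (d0 * 16)> computations of the K offset
// inside the scf.for, so both operand slices now index with the same value (%19).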
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x32x16xf32>
%12 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%13 = linalg.fill ins(%cst_0 : f32) outs(%12 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%14 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %13) -> (tensor<2x1x2x1x16x16xf32>) {
%19 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, 0, %19] [2, 1, 32, 16] [1, 1, 1, 1] : tensor<2x1x32x11536xf16> to tensor<2x1x32x16xf16>
%20 = tensor.empty() : tensor<2x1x32x16xf16>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x32x16xf16>) outs(%20 : tensor<2x1x32x16xf16>) -> tensor<2x1x32x16xf16>
%22 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%pack = tensor.pack %21 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %22 : tensor<2x1x32x16xf16> -> tensor<2x1x2x1x16x16xf16>
%extracted_slice_4 = tensor.extract_slice %padded_1[%19, 0] [16, 16] [1, 1] : tensor<11536x16xf16> to tensor<16x16xf16>
%23 = tensor.empty() : tensor<16x16xf16>
%24 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<16x16xf16>) outs(%23 : tensor<16x16xf16>) -> tensor<16x16xf16>
%25 = tensor.empty() : tensor<1x1x16x16xf16>
%pack_5 = tensor.pack %24 inner_dims_pos = [1, 0] inner_tiles = [16, 16] into %25 : tensor<16x16xf16> -> tensor<1x1x16x16xf16>
%26 = iree_gpu.multi_mma %pack, %pack_5, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %26 : tensor<2x1x2x1x16x16xf32>
}
%unpack = tensor.unpack %14 inner_dims_pos = [2, 3] inner_tiles = [16, 16] into %11 : tensor<2x1x2x1x16x16xf32> -> tensor<2x1x32x16xf32>
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%16 = linalg.copy ins(%unpack : tensor<2x1x32x16xf32>) outs(%15 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_2 = tensor.extract_slice %16[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%17 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%18 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<2x1x17x?xf32>) outs(%17 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After DecomposePackUnPackOpsPass (iree-codegen-decompose-pack-unpack-ops) //----- //
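// tensor.pack/unpack are decomposed into reshapes and transposes: each pack becomes
// tensor.expand_shape + linalg.transpose (2x1x32x16 -> 2x1x2x16x1x16 -> 2x1x2x1x16x16 for the
// LHS, 16x16 -> 1x16x1x16 -> 1x1x16x16 for the RHS), and the unpack of the accumulator becomes
// linalg.transpose + tensor.collapse_shape + linalg.copy into the 2x1x32x16xf32 buffer.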
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x32x16xf32>
%12 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%13 = linalg.fill ins(%cst_0 : f32) outs(%12 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%14 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %13) -> (tensor<2x1x2x1x16x16xf32>) {
%21 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, 0, %21] [2, 1, 32, 16] [1, 1, 1, 1] : tensor<2x1x32x11536xf16> to tensor<2x1x32x16xf16>
%22 = tensor.empty() : tensor<2x1x32x16xf16>
%23 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x32x16xf16>) outs(%22 : tensor<2x1x32x16xf16>) -> tensor<2x1x32x16xf16>
%24 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%expanded = tensor.expand_shape %23 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%transposed_4 = linalg.transpose ins(%expanded : tensor<2x1x2x16x1x16xf16>) outs(%24 : tensor<2x1x2x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_5 = tensor.extract_slice %padded_1[%21, 0] [16, 16] [1, 1] : tensor<11536x16xf16> to tensor<16x16xf16>
%25 = tensor.empty() : tensor<16x16xf16>
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_5 : tensor<16x16xf16>) outs(%25 : tensor<16x16xf16>) -> tensor<16x16xf16>
%27 = tensor.empty() : tensor<1x1x16x16xf16>
%expanded_6 = tensor.expand_shape %26 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%transposed_7 = linalg.transpose ins(%expanded_6 : tensor<1x16x1x16xf16>) outs(%27 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%28 = iree_gpu.multi_mma %transposed_4, %transposed_7, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %28 : tensor<2x1x2x1x16x16xf32>
}
%15 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%14 : tensor<2x1x2x1x16x16xf32>) outs(%15 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%16 = linalg.copy ins(%collapsed : tensor<2x1x32x16xf32>) outs(%11 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%17 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%18 = linalg.copy ins(%16 : tensor<2x1x32x16xf32>) outs(%17 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_2 = tensor.extract_slice %18[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%19 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<2x1x17x?xf32>) outs(%19 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After ConcretizeMmaShapesPass (iree-gpu-concretize-mma-shapes) //----- //
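// No structural change from the previous dump: the multi_mma operand tiles are already in the
// 16x16 intrinsic shape, so this pass appears to have nothing left to concretize here.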
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x32x16xf32>
%12 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%13 = linalg.fill ins(%cst_0 : f32) outs(%12 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%14 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %13) -> (tensor<2x1x2x1x16x16xf32>) {
%21 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, 0, %21] [2, 1, 32, 16] [1, 1, 1, 1] : tensor<2x1x32x11536xf16> to tensor<2x1x32x16xf16>
%22 = tensor.empty() : tensor<2x1x32x16xf16>
%23 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x32x16xf16>) outs(%22 : tensor<2x1x32x16xf16>) -> tensor<2x1x32x16xf16>
%24 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%expanded = tensor.expand_shape %23 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%transposed_4 = linalg.transpose ins(%expanded : tensor<2x1x2x16x1x16xf16>) outs(%24 : tensor<2x1x2x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_5 = tensor.extract_slice %padded_1[%21, 0] [16, 16] [1, 1] : tensor<11536x16xf16> to tensor<16x16xf16>
%25 = tensor.empty() : tensor<16x16xf16>
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_5 : tensor<16x16xf16>) outs(%25 : tensor<16x16xf16>) -> tensor<16x16xf16>
%27 = tensor.empty() : tensor<1x1x16x16xf16>
%expanded_6 = tensor.expand_shape %26 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%transposed_7 = linalg.transpose ins(%expanded_6 : tensor<1x16x1x16xf16>) outs(%27 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%28 = iree_gpu.multi_mma %transposed_4, %transposed_7, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %28 : tensor<2x1x2x1x16x16xf32>
}
%15 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%14 : tensor<2x1x2x1x16x16xf32>) outs(%15 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%16 = linalg.copy ins(%collapsed : tensor<2x1x32x16xf32>) outs(%11 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%17 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%18 = linalg.copy ins(%16 : tensor<2x1x32x16xf32>) outs(%17 : tensor<2x1x32x16xf32>) -> tensor<2x1x32x16xf32>
%extracted_slice_2 = tensor.extract_slice %18[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%19 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<2x1x17x?xf32>) outs(%19 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After PropagateReshapesByExpansionPass (iree-codegen-propagate-reshapes-by-expansion) //----- //
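// Reshape propagation: the collapse_shape that followed the accumulator transpose is pushed
// toward the write-back. The copies into and out of the workgroup-memory buffer now run as
// identity linalg.generic ops on the expanded 2x1x2x16x1x16 shape (the workgroup alloc_tensor
// is expand_shape'd to match), and the collapse to 2x1x32x16 happens only right before the
// final extract_slice.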
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%13 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %12) -> (tensor<2x1x2x1x16x16xf32>) {
%21 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, 0, %21] [2, 1, 32, 16] [1, 1, 1, 1] : tensor<2x1x32x11536xf16> to tensor<2x1x32x16xf16>
%22 = tensor.empty() : tensor<2x1x32x16xf16>
%23 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x32x16xf16>) outs(%22 : tensor<2x1x32x16xf16>) -> tensor<2x1x32x16xf16>
%24 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%expanded_4 = tensor.expand_shape %23 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%transposed_5 = linalg.transpose ins(%expanded_4 : tensor<2x1x2x16x1x16xf16>) outs(%24 : tensor<2x1x2x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_6 = tensor.extract_slice %padded_1[%21, 0] [16, 16] [1, 1] : tensor<11536x16xf16> to tensor<16x16xf16>
%25 = tensor.empty() : tensor<16x16xf16>
%26 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_6 : tensor<16x16xf16>) outs(%25 : tensor<16x16xf16>) -> tensor<16x16xf16>
%27 = tensor.empty() : tensor<1x1x16x16xf16>
%expanded_7 = tensor.expand_shape %26 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%transposed_8 = linalg.transpose ins(%expanded_7 : tensor<1x16x1x16xf16>) outs(%27 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%28 = iree_gpu.multi_mma %transposed_5, %transposed_8, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %28 : tensor<2x1x2x1x16x16xf32>
}
%14 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%13 : tensor<2x1x2x1x16x16xf32>) outs(%14 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%15 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%transposed : tensor<2x1x2x16x1x16xf32>) outs(%15 : tensor<2x1x2x16x1x16xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<2x1x2x16x1x16xf32>
%17 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%expanded = tensor.expand_shape %17 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf32> into tensor<2x1x2x16x1x16xf32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<2x1x2x16x1x16xf32>) outs(%expanded : tensor<2x1x2x16x1x16xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<2x1x2x16x1x16xf32>
%collapsed = tensor.collapse_shape %18 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%extracted_slice_2 = tensor.extract_slice %collapsed[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%19 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<2x1x17x?xf32>) outs(%19 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
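// Canonicalization removes the identity linalg.generic copies and the expand_shape of the
// workgroup buffer introduced above: the collapse_shape now applies directly to the transposed
// accumulator, and the workgroup-memory alloc_tensor is left unused in this dump.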
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%13 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %12) -> (tensor<2x1x2x1x16x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, 0, %18] [2, 1, 32, 16] [1, 1, 1, 1] : tensor<2x1x32x11536xf16> to tensor<2x1x32x16xf16>
%19 = tensor.empty() : tensor<2x1x32x16xf16>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x32x16xf16>) outs(%19 : tensor<2x1x32x16xf16>) -> tensor<2x1x32x16xf16>
%21 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%expanded = tensor.expand_shape %20 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%transposed_4 = linalg.transpose ins(%expanded : tensor<2x1x2x16x1x16xf16>) outs(%21 : tensor<2x1x2x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_5 = tensor.extract_slice %padded_1[%18, 0] [16, 16] [1, 1] : tensor<11536x16xf16> to tensor<16x16xf16>
%22 = tensor.empty() : tensor<16x16xf16>
%23 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_5 : tensor<16x16xf16>) outs(%22 : tensor<16x16xf16>) -> tensor<16x16xf16>
%24 = tensor.empty() : tensor<1x1x16x16xf16>
%expanded_6 = tensor.expand_shape %23 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%transposed_7 = linalg.transpose ins(%expanded_6 : tensor<1x16x1x16xf16>) outs(%24 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%25 = iree_gpu.multi_mma %transposed_4, %transposed_7, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %25 : tensor<2x1x2x1x16x16xf32>
}
%14 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%13 : tensor<2x1x2x1x16x16xf32>) outs(%14 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%extracted_slice_2 = tensor.extract_slice %collapsed[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%16 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%17 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<2x1x17x?xf32>) outs(%16 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
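// CSE finds nothing further to eliminate; this dump is identical to the previous one.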
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x17x11529xf16>
%9 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %c0] * [17, 1] k_offset = [0] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%3 : tensor<2x35x35x1281xf16>) outs(%8 : tensor<2x1x17x11529xf16>) -> tensor<2x1x17x11529xf16>
%extracted_slice = tensor.extract_slice %5[0, %arg1] [11529, %7] [1, 1] : tensor<11529x1281xf16> to tensor<11529x?xf16>
%padded = tensor.pad %9 low[0, 0, 0, 0] high[0, 0, 15, 7] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f16
} : tensor<2x1x17x11529xf16> to tensor<2x1x32x11536xf16>
%10 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%7)
%padded_1 = tensor.pad %extracted_slice low[0, 0] high[7, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f16
} : tensor<11529x?xf16> to tensor<11536x16xf16>
%11 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%13 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %12) -> (tensor<2x1x2x1x16x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_3 = tensor.extract_slice %padded[0, 0, 0, %18] [2, 1, 32, 16] [1, 1, 1, 1] : tensor<2x1x32x11536xf16> to tensor<2x1x32x16xf16>
%19 = tensor.empty() : tensor<2x1x32x16xf16>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<2x1x32x16xf16>) outs(%19 : tensor<2x1x32x16xf16>) -> tensor<2x1x32x16xf16>
%21 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%expanded = tensor.expand_shape %20 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%transposed_4 = linalg.transpose ins(%expanded : tensor<2x1x2x16x1x16xf16>) outs(%21 : tensor<2x1x2x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_5 = tensor.extract_slice %padded_1[%18, 0] [16, 16] [1, 1] : tensor<11536x16xf16> to tensor<16x16xf16>
%22 = tensor.empty() : tensor<16x16xf16>
%23 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_5 : tensor<16x16xf16>) outs(%22 : tensor<16x16xf16>) -> tensor<16x16xf16>
%24 = tensor.empty() : tensor<1x1x16x16xf16>
%expanded_6 = tensor.expand_shape %23 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%transposed_7 = linalg.transpose ins(%expanded_6 : tensor<1x16x1x16xf16>) outs(%24 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%25 = iree_gpu.multi_mma %transposed_4, %transposed_7, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %25 : tensor<2x1x2x1x16x16xf32>
}
%14 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%13 : tensor<2x1x2x1x16x16xf32>) outs(%14 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%extracted_slice_2 = tensor.extract_slice %collapsed[0, 0, 0, 0] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<2x1x17x?xf32>
%16 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%17 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<2x1x17x?xf32>) outs(%16 : tensor<2x1x17x?xf32>) -> tensor<2x1x17x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
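// Note on the constants that recur in the dumps below: a minimal Python sketch
// (variable names are illustrative, not from the IR) of how the im2col/IGEMM shapes
// and the %c721 reduction trip count follow from the conv_nhwc_unaligned_stride_2 shapes.
import math

N, H, W, C = 2, 35, 35, 1281        # input tensor<2x35x35x1281xf16>
kh, kw, stride = 3, 3, 2            # kernel_size = [3, 3], strides = [2, 2]
k_tile, n_tile = 16, 16             # reduction / workgroup-N tile sizes in the lowering_config

K = kh * kw * C                     # im2col reduction size        -> 11529
H_out = (H - kh) // stride + 1      # output spatial size          -> 17
trips = math.ceil(K / k_tile)       # scf.for trip count (%c721)   -> 721
assert (K, H_out, trips) == (11529, 17, 721)
assert 1281 % n_tile != 0           # N = 1281 is unaligned, hence the affine.min<(d0) -> (-d0 + 1281, 16)> guards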
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%9 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%10 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %9) -> (tensor<2x1x2x1x16x16xf32>) {
%15 = tensor.empty() : tensor<2x1x32x16xf16>
%16 = scf.forall (%arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0) to (2, 1, 32, 16) step (1, 1, 1, 4) shared_outs(%arg9 = %15) -> (tensor<2x1x32x16xf16>) {
%22 = affine.min affine_map<(d0) -> (d0, 2)>(%arg5)
%23 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.min affine_map<(d0) -> (d0, 1)>(%arg6)
%26 = affine.min affine_map<(d0) -> (-d0 + 1, 1)>(%25)
%27 = arith.cmpi eq, %26, %c0 : index
%28 = arith.ori %27, %24 : i1
%29 = affine.min affine_map<(d0) -> (d0, 17)>(%arg7)
%30 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
%33 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%30)
%34 = affine.min affine_map<(d0)[s0] -> (d0 * 16 + s0, 11529)>(%arg3)[%arg8]
%35 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %32 : i1
%38 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%35)
%39 = scf.if %37 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_4 = tensor.extract_slice %3[%22, 0, 0, 0] [%23, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%41 = tensor.empty(%23, %26, %30, %35) : tensor<?x?x?x?xf16>
%42 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%25, %arg0)
%43 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%42, %29] * [17, 1] k_offset = [%34] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_4 : tensor<?x35x35x1281xf16>) outs(%41 : tensor<?x?x?x?xf16>) -> tensor<?x?x?x?xf16>
%padded = tensor.pad %43 low[0, 0, 0, 0] high[0, 0, %33, %38] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x?x?x?xf16> to tensor<?x?x?x?xf16>
%cast = tensor.cast %padded : tensor<?x?x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %cast : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, %arg6, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%40 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%39 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %40 into %arg9[%arg5, %arg6, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%17 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%expanded = tensor.expand_shape %16 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%transposed_1 = linalg.transpose ins(%expanded : tensor<2x1x2x16x1x16xf16>) outs(%17 : tensor<2x1x2x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%18 = tensor.empty() : tensor<16x16xf16>
%19 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %18) -> (tensor<16x16xf16>) {
%22 = affine.min affine_map<(d0)[s0] -> (d0 * 16 + s0, 11529)>(%arg3)[%arg5]
%23 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%23)
%26 = affine.min affine_map<(d0, d1) -> (d0, d1)>(%arg6, %7)
%27 = affine.min affine_map<(d0, d1) -> (d0 - d1, 1)>(%7, %26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %24 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = scf.if %29 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%33 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%26, %arg1]
%extracted_slice_4 = tensor.extract_slice %5[%22, %33] [%23, %27] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_4 low[0, 0] high[%25, %30] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<?x?xf16>
%cast = tensor.cast %padded : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %cast : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%32 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%31 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %32 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%20 = tensor.empty() : tensor<1x1x16x16xf16>
%expanded_2 = tensor.expand_shape %19 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%transposed_3 = linalg.transpose ins(%expanded_2 : tensor<1x16x1x16xf16>) outs(%20 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%21 = iree_gpu.multi_mma %transposed_1, %transposed_3, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %21 : tensor<2x1x2x1x16x16xf32>
}
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%10 : tensor<2x1x2x1x16x16xf32>) outs(%11 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%13 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%14 = scf.forall (%arg3, %arg4, %arg5, %arg6) = (0, 0, 0, 0) to (2, 1, 17, %7) step (1, 1, 1, 4) shared_outs(%arg7 = %13) -> (tensor<2x1x17x?xf32>) {
%15 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%arg6)[%7]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg7[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%9 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%10 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %9) -> (tensor<2x1x2x1x16x16xf32>) {
%15 = tensor.empty() : tensor<2x1x32x16xf16>
%16 = scf.forall (%arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0) to (2, 1, 32, 16) step (1, 1, 1, 4) shared_outs(%arg9 = %15) -> (tensor<2x1x32x16xf16>) {
%22 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%23 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%26 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%25)
%27 = arith.cmpi eq, %26, %c0 : index
%28 = arith.ori %27, %24 : i1
%29 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%26)
%30 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg8]
%31 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%30)
%32 = arith.cmpi eq, %31, %c0 : index
%33 = arith.ori %32, %28 : i1
%34 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%31)
%35 = scf.if %33 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_4 = tensor.extract_slice %3[%22, 0, 0, 0] [%23, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%37 = tensor.empty(%23, %26, %31) : tensor<?x1x?x?xf16>
%38 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %25] * [17, 1] k_offset = [%30] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_4 : tensor<?x35x35x1281xf16>) outs(%37 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %38 low[0, 0, 0, 0] high[0, 0, %29, %34] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%36 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%35 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %36 into %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%17 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%expanded = tensor.expand_shape %16 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%transposed_1 = linalg.transpose ins(%expanded : tensor<2x1x2x16x1x16xf16>) outs(%17 : tensor<2x1x2x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%18 = tensor.empty() : tensor<16x16xf16>
%19 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %18) -> (tensor<16x16xf16>) {
%22 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%23 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%23)
%26 = affine.min affine_map<(d0, d1) -> (-d1 + 1281, 16, d0)>(%arg6, %arg1)
%27 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%7, %26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %24 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = scf.if %29 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%33 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%26, %arg1]
%extracted_slice_4 = tensor.extract_slice %5[%22, %33] [%23, %27] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_4 low[0, 0] high[%25, %30] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%32 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%31 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %32 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%20 = tensor.empty() : tensor<1x1x16x16xf16>
%expanded_2 = tensor.expand_shape %19 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%transposed_3 = linalg.transpose ins(%expanded_2 : tensor<1x16x1x16xf16>) outs(%20 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%21 = iree_gpu.multi_mma %transposed_1, %transposed_3, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %21 : tensor<2x1x2x1x16x16xf32>
}
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%10 : tensor<2x1x2x1x16x16xf32>) outs(%11 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%13 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%14 = scf.forall (%arg3, %arg4, %arg5, %arg6) = (0, 0, 0, 0) to (2, 1, 17, %7) step (1, 1, 1, 4) shared_outs(%arg7 = %13) -> (tensor<2x1x17x?xf32>) {
%15 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%arg6)[%7]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%9 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%10 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %9) -> (tensor<2x1x2x1x16x16xf32>) {
%15 = tensor.empty() : tensor<2x1x32x16xf16>
%16 = scf.forall (%arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0) to (2, 1, 32, 16) step (1, 1, 1, 4) shared_outs(%arg9 = %15) -> (tensor<2x1x32x16xf16>) {
%22 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%23 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%26 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%25)
%27 = arith.cmpi eq, %26, %c0 : index
%28 = arith.ori %27, %24 : i1
%29 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%26)
%30 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg8]
%31 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%30)
%32 = arith.cmpi eq, %31, %c0 : index
%33 = arith.ori %32, %28 : i1
%34 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%31)
%35 = scf.if %33 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_4 = tensor.extract_slice %3[%22, 0, 0, 0] [%23, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%37 = tensor.empty(%23, %26, %31) : tensor<?x1x?x?xf16>
%38 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %25] * [17, 1] k_offset = [%30] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_4 : tensor<?x35x35x1281xf16>) outs(%37 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %38 low[0, 0, 0, 0] high[0, 0, %29, %34] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%36 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%35 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %36 into %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%17 = tensor.empty() : tensor<2x1x2x1x16x16xf16>
%expanded = tensor.expand_shape %16 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%transposed_1 = linalg.transpose ins(%expanded : tensor<2x1x2x16x1x16xf16>) outs(%17 : tensor<2x1x2x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%18 = tensor.empty() : tensor<16x16xf16>
%19 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %18) -> (tensor<16x16xf16>) {
%22 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%23 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%23)
%26 = affine.min affine_map<(d0, d1) -> (-d1 + 1281, 16, d0)>(%arg6, %arg1)
%27 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%7, %26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %24 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = scf.if %29 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%33 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%26, %arg1]
%extracted_slice_4 = tensor.extract_slice %5[%22, %33] [%23, %27] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_4 low[0, 0] high[%25, %30] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%32 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%31 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %32 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%20 = tensor.empty() : tensor<1x1x16x16xf16>
%expanded_2 = tensor.expand_shape %19 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%transposed_3 = linalg.transpose ins(%expanded_2 : tensor<1x16x1x16xf16>) outs(%20 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%21 = iree_gpu.multi_mma %transposed_1, %transposed_3, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<2x1x2x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<2x1x2x1x16x16xf32>
scf.yield %21 : tensor<2x1x2x1x16x16xf32>
}
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%10 : tensor<2x1x2x1x16x16xf32>) outs(%11 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%13 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%14 = scf.forall (%arg3, %arg4, %arg5, %arg6) = (0, 0, 0, 0) to (2, 1, 17, %7) step (1, 1, 1, 4) shared_outs(%arg7 = %13) -> (tensor<2x1x17x?xf32>) {
%15 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%arg6)[%7]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
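// Note on the promotion copies in the dump above: both operand copies are distributed
// so that one scf.forall iteration maps to one of the 256 workgroup threads
// (workgroup_size = [256, 1, 1]). A minimal sketch of that iteration count:
lhs_iters = 2 * 1 * 32 * (16 // 4)  # forall to (2, 1, 32, 16) step (1, 1, 1, 4): one 1x1x1x4 f16 vector per thread
rhs_iters = 16 * 16                 # forall in (16, 16): one f16 element per thread
assert lhs_iters == rhs_iters == 256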
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%9 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%10 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %9) -> (tensor<2x1x2x1x16x16xf32>) {
%15 = tensor.empty() : tensor<2x1x32x16xf16>
%16 = scf.forall (%arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0) to (2, 1, 32, 16) step (1, 1, 1, 4) shared_outs(%arg9 = %15) -> (tensor<2x1x32x16xf16>) {
%20 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%21 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%24 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%23)
%25 = arith.cmpi eq, %24, %c0 : index
%26 = arith.ori %25, %22 : i1
%27 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%24)
%28 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg8]
%29 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%28)
%30 = arith.cmpi eq, %29, %c0 : index
%31 = arith.ori %30, %26 : i1
%32 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%29)
%33 = scf.if %31 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_2 = tensor.extract_slice %3[%20, 0, 0, 0] [%21, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%35 = tensor.empty(%21, %24, %29) : tensor<?x1x?x?xf16>
%36 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %23] * [17, 1] k_offset = [%28] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_2 : tensor<?x35x35x1281xf16>) outs(%35 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %36 low[0, 0, 0, 0] high[0, 0, %27, %32] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%34 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%33 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %34 into %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded = tensor.expand_shape %16 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%17 = tensor.empty() : tensor<16x16xf16>
%18 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %17) -> (tensor<16x16xf16>) {
%20 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%21)
%24 = affine.min affine_map<(d0, d1) -> (-d1 + 1281, 16, d0)>(%arg6, %arg1)
%25 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%7, %24)
%26 = arith.cmpi eq, %25, %c0 : index
%27 = arith.ori %26, %22 : i1
%28 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%25)
%29 = scf.if %27 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%31 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%24, %arg1]
%extracted_slice_2 = tensor.extract_slice %5[%20, %31] [%21, %25] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_2 low[0, 0] high[%23, %28] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%30 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%29 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %30 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded_1 = tensor.expand_shape %18 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%19 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 2, 1) shared_outs(%arg9 = %arg4) -> (tensor<2x1x2x1x16x16xf32>) {
%extracted_slice = tensor.extract_slice %expanded[%arg5, %arg6, %arg7, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x16x1x16xf16>
%20 = tensor.empty() : tensor<1x1x1x1x16x16xf16>
%transposed_2 = linalg.transpose ins(%extracted_slice : tensor<1x1x1x16x1x16xf16>) outs(%20 : tensor<1x1x1x1x16x16xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_3 = tensor.extract_slice %expanded_1[0, 0, %arg8, 0] [1, 16, 1, 16] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x16x1x16xf16>
%21 = tensor.empty() : tensor<1x1x16x16xf16>
%transposed_4 = linalg.transpose ins(%extracted_slice_3 : tensor<1x16x1x16xf16>) outs(%21 : tensor<1x1x16x16xf16>) permutation = [0, 2, 3, 1]
%extracted_slice_5 = tensor.extract_slice %arg9[%arg5, %arg6, %arg7, %arg8, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x1x16x16xf32> to tensor<1x1x1x1x16x16xf32>
%22 = iree_gpu.multi_mma %transposed_2, %transposed_4, %extracted_slice_5 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>, rhs_permutation = array<i64: 1, 0>} : tensor<1x1x1x1x16x16xf16>, tensor<1x1x16x16xf16> into tensor<1x1x1x1x16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[%arg5, %arg6, %arg7, %arg8, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> into tensor<2x1x2x1x16x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
scf.yield %19 : tensor<2x1x2x1x16x16xf32>
}
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%10 : tensor<2x1x2x1x16x16xf32>) outs(%11 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%13 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%14 = scf.forall (%arg3, %arg4, %arg5, %arg6) = (0, 0, 0, 0) to (2, 1, 17, %7) step (1, 1, 1, 4) shared_outs(%arg7 = %13) -> (tensor<2x1x17x?xf32>) {
%15 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%arg6)[%7]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After DistributeMmaToLanesPass (iree-gpu-distribute-mma-to-lanes) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%9 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%10 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %9) -> (tensor<2x1x2x1x16x16xf32>) {
%15 = tensor.empty() : tensor<2x1x32x16xf16>
%16 = scf.forall (%arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0) to (2, 1, 32, 16) step (1, 1, 1, 4) shared_outs(%arg9 = %15) -> (tensor<2x1x32x16xf16>) {
%20 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%21 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%24 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%23)
%25 = arith.cmpi eq, %24, %c0 : index
%26 = arith.ori %25, %22 : i1
%27 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%24)
%28 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg8]
%29 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%28)
%30 = arith.cmpi eq, %29, %c0 : index
%31 = arith.ori %30, %26 : i1
%32 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%29)
%33 = scf.if %31 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_2 = tensor.extract_slice %3[%20, 0, 0, 0] [%21, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%35 = tensor.empty(%21, %24, %29) : tensor<?x1x?x?xf16>
%36 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %23] * [17, 1] k_offset = [%28] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_2 : tensor<?x35x35x1281xf16>) outs(%35 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %36 low[0, 0, 0, 0] high[0, 0, %27, %32] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%34 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%33 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %34 into %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded = tensor.expand_shape %16 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%17 = tensor.empty() : tensor<16x16xf16>
%18 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %17) -> (tensor<16x16xf16>) {
%20 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%21)
%24 = affine.min affine_map<(d0, d1) -> (-d1 + 1281, 16, d0)>(%arg6, %arg1)
%25 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%7, %24)
%26 = arith.cmpi eq, %25, %c0 : index
%27 = arith.ori %26, %22 : i1
%28 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%25)
%29 = scf.if %27 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%31 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%24, %arg1]
%extracted_slice_2 = tensor.extract_slice %5[%20, %31] [%21, %25] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_2 low[0, 0] high[%23, %28] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%30 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%29 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %30 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded_1 = tensor.expand_shape %18 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%19 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 2, 1) shared_outs(%arg9 = %arg4) -> (tensor<2x1x2x1x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg9[%arg5, %arg6, %arg7, %arg8, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x1x16x16xf32> to tensor<1x1x1x1x16x16xf32>
%20 = scf.forall (%arg10) in (64) shared_outs(%arg11 = %extracted_slice) -> (tensor<1x1x1x1x16x16xf32>) {
%21 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%22 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%extracted_slice_2 = tensor.extract_slice %expanded[%arg5, %arg6, %arg7, %21, 0, %22] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%23 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%23 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%25 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%extracted_slice_4 = tensor.extract_slice %expanded_1[0, %24, %arg8, %25] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%26 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_5 = linalg.transpose ins(%extracted_slice_4 : tensor<1x4x1x1xf16>) outs(%26 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%27 = affine.apply affine_map<(d0) -> (((d0 floordiv 16) mod 4) * 4)>(%arg10)
%28 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%extracted_slice_6 = tensor.extract_slice %arg11[0, 0, 0, 0, %27, %28] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> to tensor<1x1x1x1x4x1xf32>
%29 = iree_gpu.multi_mma %transposed_3, %transposed_5, %extracted_slice_6 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %29 into %arg11[0, 0, 0, 0, %27, %28] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x1xf32> into tensor<1x1x1x1x16x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg9[%arg5, %arg6, %arg7, %arg8, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> into tensor<2x1x2x1x16x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
scf.yield %19 : tensor<2x1x2x1x16x16xf32>
}
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%10 : tensor<2x1x2x1x16x16xf32>) outs(%11 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%13 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%14 = scf.forall (%arg3, %arg4, %arg5, %arg6) = (0, 0, 0, 0) to (2, 1, 17, %7) step (1, 1, 1, 4) shared_outs(%arg7 = %13) -> (tensor<2x1x17x?xf32>) {
%15 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%arg6)[%7]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
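// Note on the lane distribution introduced by DistributeMmaToLanesPass above: the affine
// maps inside the #iree_gpu.lane_id<0> forall tile the 16x16x16 MFMA operands across the
// 64 lanes of a subgroup. A minimal Python sketch (function names are illustrative):
def a_slice(lane):
    # maps %21/%22: (d0 mod 16, ((d0 floordiv 16) mod 4) * 4) -> each lane loads a 1x4 f16 strip of A
    return lane % 16, ((lane // 16) % 4) * 4

def acc_slice(lane):
    # maps %27/%28: (((d0 floordiv 16) mod 4) * 4, d0 mod 16) -> each lane owns a 4x1 f32 strip of the accumulator
    return ((lane // 16) % 4) * 4, lane % 16

# 64 lanes x 4 elements tile the 16x16 accumulator exactly (256 lane-elements = 16 * 16).
cover = {(r + i, c) for lane in range(64) for r, c in [acc_slice(lane)] for i in range(4)}
assert cover == {(r, c) for r in range(16) for c in range(16)}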
// -----// IR Dump After GPULowerToUKernelsPass (iree-codegen-gpu-lower-to-ukernels) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (17, 1281) step (1, 16) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%arg1)
%8 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%9 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%10 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %9) -> (tensor<2x1x2x1x16x16xf32>) {
%15 = tensor.empty() : tensor<2x1x32x16xf16>
%16 = scf.forall (%arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0) to (2, 1, 32, 16) step (1, 1, 1, 4) shared_outs(%arg9 = %15) -> (tensor<2x1x32x16xf16>) {
%20 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%21 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%24 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%23)
%25 = arith.cmpi eq, %24, %c0 : index
%26 = arith.ori %25, %22 : i1
%27 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%24)
%28 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg8]
%29 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%28)
%30 = arith.cmpi eq, %29, %c0 : index
%31 = arith.ori %30, %26 : i1
%32 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%29)
%33 = scf.if %31 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_2 = tensor.extract_slice %3[%20, 0, 0, 0] [%21, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%35 = tensor.empty(%21, %24, %29) : tensor<?x1x?x?xf16>
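// The im2col op below gathers the 3x3, stride-2 convolution window for the current (m, k) position straight from the 2x35x35x1281 input; with the slice bounds above it produces at most a 1x1x1x4 sliver per thread, so no full im2col buffer is ever materialized.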
%36 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %23] * [17, 1] k_offset = [%28] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_2 : tensor<?x35x35x1281xf16>) outs(%35 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %36 low[0, 0, 0, 0] high[0, 0, %27, %32] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%34 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%33 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %34 into %arg9[%arg5, 0, %arg7, %arg8] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded = tensor.expand_shape %16 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%17 = tensor.empty() : tensor<16x16xf16>
%18 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %17) -> (tensor<16x16xf16>) {
%20 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%21)
%24 = affine.min affine_map<(d0, d1) -> (-d1 + 1281, 16, d0)>(%arg6, %arg1)
%25 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%7, %24)
%26 = arith.cmpi eq, %25, %c0 : index
%27 = arith.ori %26, %22 : i1
%28 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%25)
%29 = scf.if %27 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%31 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%24, %arg1]
%extracted_slice_2 = tensor.extract_slice %5[%20, %31] [%21, %25] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_2 low[0, 0] high[%23, %28] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%30 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%29 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %30 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded_1 = tensor.expand_shape %18 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%19 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 2, 1) shared_outs(%arg9 = %arg4) -> (tensor<2x1x2x1x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg9[%arg5, %arg6, %arg7, %arg8, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x1x16x16xf32> to tensor<1x1x1x1x16x16xf32>
%20 = scf.forall (%arg10) in (64) shared_outs(%arg11 = %extracted_slice) -> (tensor<1x1x1x1x16x16xf32>) {
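// Lane distribution over the 64-lane subgroup: the affine maps below take %arg10 mod 16 for the row/column inside the 16x16 tile and use %arg10 floordiv 16 to select one of four groups of 4 contiguous elements, consistent with the MFMA_F32_16x16x16_F16 layout (4 x f16 per lane for A and B, 4 x f32 per lane for the accumulator).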
%21 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%22 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%extracted_slice_2 = tensor.extract_slice %expanded[%arg5, %arg6, %arg7, %21, 0, %22] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%23 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%23 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%25 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%extracted_slice_4 = tensor.extract_slice %expanded_1[0, %24, %arg8, %25] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%26 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_5 = linalg.transpose ins(%extracted_slice_4 : tensor<1x4x1x1xf16>) outs(%26 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%27 = affine.apply affine_map<(d0) -> (((d0 floordiv 16) mod 4) * 4)>(%arg10)
%28 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%extracted_slice_6 = tensor.extract_slice %arg11[0, 0, 0, 0, %27, %28] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> to tensor<1x1x1x1x4x1xf32>
%29 = iree_gpu.multi_mma %transposed_3, %transposed_5, %extracted_slice_6 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %29 into %arg11[0, 0, 0, 0, %27, %28] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x1xf32> into tensor<1x1x1x1x16x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg9[%arg5, %arg6, %arg7, %arg8, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> into tensor<2x1x2x1x16x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
scf.yield %19 : tensor<2x1x2x1x16x16xf32>
}
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%10 : tensor<2x1x2x1x16x16xf32>) outs(%11 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%13 = tensor.empty(%7) : tensor<2x1x17x?xf32>
%14 = scf.forall (%arg3, %arg4, %arg5, %arg6) = (0, 0, 0, 0) to (2, 1, 17, %7) step (1, 1, 1, 4) shared_outs(%arg7 = %13) -> (tensor<2x1x17x?xf32>) {
%15 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%arg6)[%7]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%16 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg7[%arg3, 0, %arg5, %arg6] [1, 1, 1, %15] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg2[0, %arg0, 0, %arg1] [2, 1, 17, %7] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After NormalizeLoopBoundsPass (iree-codegen-normalize-loop-bounds) //----- //
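// Note: relative to the previous dump, the scf.forall bounds are normalized to zero-based, unit-step ranges: the workgroup forall `(0, 0) to (17, 1281) step (1, 16)` becomes `in (17, 81)` with an `affine.apply (d0 * 16)` recovering the original column offset, and the thread forall over `(2, 1, 32, 16) step (1, 1, 1, 4)` becomes `in (2, 1, 32, 4)` with `d0 * 4` applied inside the body.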
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.apply affine_map<(d0) -> (d0)>(%arg0)
%9 = affine.min affine_map<(d0) -> (-d0 + 1281, 16)>(%7)
%10 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%11 = linalg.fill ins(%cst_0 : f32) outs(%10 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%12 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %11) -> (tensor<2x1x2x1x16x16xf32>) {
%18 = tensor.empty() : tensor<2x1x32x16xf16>
%19 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 32, 4) shared_outs(%arg9 = %18) -> (tensor<2x1x32x16xf16>) {
%23 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg8)
%24 = affine.apply affine_map<(d0) -> (d0)>(%arg7)
%25 = affine.apply affine_map<(d0) -> (d0)>(%arg6)
%26 = affine.apply affine_map<(d0) -> (d0)>(%arg5)
%27 = affine.min affine_map<(d0) -> (2, d0)>(%26)
%28 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%27)
%29 = arith.cmpi eq, %28, %c0 : index
%30 = affine.min affine_map<(d0) -> (17, d0)>(%24)
%31 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%30)
%32 = arith.cmpi eq, %31, %c0 : index
%33 = arith.ori %32, %29 : i1
%34 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%31)
%35 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%23]
%36 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %37, %33 : i1
%39 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%36)
%40 = scf.if %38 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_2 = tensor.extract_slice %3[%27, 0, 0, 0] [%28, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%42 = tensor.empty(%28, %31, %36) : tensor<?x1x?x?xf16>
%43 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%8, %30] * [17, 1] k_offset = [%35] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_2 : tensor<?x35x35x1281xf16>) outs(%42 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %43 low[0, 0, 0, 0] high[0, 0, %34, %39] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%26, 0, %24, %23] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%41 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%40 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %41 into %arg9[%26, 0, %24, %23] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded = tensor.expand_shape %19 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%20 = tensor.empty() : tensor<16x16xf16>
%21 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %20) -> (tensor<16x16xf16>) {
%23 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%24 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%23)
%25 = arith.cmpi eq, %24, %c0 : index
%26 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%24)
%27 = affine.min affine_map<(d0, d1) -> (-d1 + 1281, 16, d0)>(%arg6, %7)
%28 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%9, %27)
%29 = arith.cmpi eq, %28, %c0 : index
%30 = arith.ori %29, %25 : i1
%31 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%28)
%32 = scf.if %30 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%34 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%27, %7]
%extracted_slice_2 = tensor.extract_slice %5[%23, %34] [%24, %28] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_2 low[0, 0] high[%26, %31] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%33 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%32 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %33 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded_1 = tensor.expand_shape %21 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%22 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 2, 1) shared_outs(%arg9 = %arg4) -> (tensor<2x1x2x1x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg9[%arg5, %arg6, %arg7, %arg8, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x1x16x16xf32> to tensor<1x1x1x1x16x16xf32>
%23 = scf.forall (%arg10) in (64) shared_outs(%arg11 = %extracted_slice) -> (tensor<1x1x1x1x16x16xf32>) {
%24 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%25 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%extracted_slice_2 = tensor.extract_slice %expanded[%arg5, %arg6, %arg7, %24, 0, %25] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%26 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%26 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%27 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%28 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%extracted_slice_4 = tensor.extract_slice %expanded_1[0, %27, %arg8, %28] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%29 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_5 = linalg.transpose ins(%extracted_slice_4 : tensor<1x4x1x1xf16>) outs(%29 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%30 = affine.apply affine_map<(d0) -> (((d0 floordiv 16) mod 4) * 4)>(%arg10)
%31 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%extracted_slice_6 = tensor.extract_slice %arg11[0, 0, 0, 0, %30, %31] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> to tensor<1x1x1x1x4x1xf32>
%32 = iree_gpu.multi_mma %transposed_3, %transposed_5, %extracted_slice_6 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %32 into %arg11[0, 0, 0, 0, %30, %31] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x1xf32> into tensor<1x1x1x1x16x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg9[%arg5, %arg6, %arg7, %arg8, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> into tensor<2x1x2x1x16x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
scf.yield %22 : tensor<2x1x2x1x16x16xf32>
}
%13 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%12 : tensor<2x1x2x1x16x16xf32>) outs(%13 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%15 = tensor.empty(%9) : tensor<2x1x17x?xf32>
%16 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%9)
%17 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %16) shared_outs(%arg7 = %15) -> (tensor<2x1x17x?xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%19 = affine.apply affine_map<(d0) -> (d0)>(%arg5)
%20 = affine.apply affine_map<(d0) -> (d0)>(%arg4)
%21 = affine.apply affine_map<(d0) -> (d0)>(%arg3)
%22 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%18)[%9]
%extracted_slice = tensor.extract_slice %collapsed[%21, 0, %19, %18] [1, 1, 1, %22] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%21, 0, %19, %18] [1, 1, 1, %22] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%23 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg7[%21, 0, %19, %18] [1, 1, 1, %22] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg2[0, %8, 0, %7] [2, 1, 17, %9] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
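// Note: canonicalization composes the affine.apply / affine.min chains into single maps (e.g. `d0 * -16 + 1281` for the tail extent of the N dimension) and removes the identity affine.apply ops that loop normalization introduced, so the thread ids are used directly again.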
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%11 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %10) -> (tensor<2x1x2x1x16x16xf32>) {
%17 = tensor.empty() : tensor<2x1x32x16xf16>
%18 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 32, 4) shared_outs(%arg9 = %17) -> (tensor<2x1x32x16xf16>) {
%22 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg8)
%23 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%24 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%23)
%25 = arith.cmpi eq, %24, %c0 : index
%26 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%27 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %25 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg3, %arg8)
%32 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = arith.ori %33, %29 : i1
%35 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%32)
%36 = scf.if %34 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_2 = tensor.extract_slice %3[%23, 0, 0, 0] [%24, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%38 = tensor.empty(%24, %27, %32) : tensor<?x1x?x?xf16>
%39 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %26] * [17, 1] k_offset = [%31] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_2 : tensor<?x35x35x1281xf16>) outs(%38 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %39 low[0, 0, 0, 0] high[0, 0, %30, %35] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %22] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%37 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%36 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %37 into %arg9[%arg5, 0, %arg7, %22] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%19 = tensor.empty() : tensor<16x16xf16>
%20 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %19) -> (tensor<16x16xf16>) {
%22 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%23 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%23)
%26 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%arg6, %arg1)
%27 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %24 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = scf.if %29 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%33 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%26]
%extracted_slice_2 = tensor.extract_slice %5[%22, %33] [%23, %27] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_2 low[0, 0] high[%25, %30] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%32 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%31 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %32 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded_1 = tensor.expand_shape %20 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%21 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 2, 1) shared_outs(%arg9 = %arg4) -> (tensor<2x1x2x1x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, 0, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x1x16x16xf32> to tensor<1x1x1x1x16x16xf32>
%22 = scf.forall (%arg10) in (64) shared_outs(%arg11 = %extracted_slice) -> (tensor<1x1x1x1x16x16xf32>) {
%23 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%extracted_slice_2 = tensor.extract_slice %expanded[%arg5, 0, %arg7, %23, 0, %24] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%25 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%25 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%26 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%27 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%extracted_slice_4 = tensor.extract_slice %expanded_1[0, %26, 0, %27] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%28 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_5 = linalg.transpose ins(%extracted_slice_4 : tensor<1x4x1x1xf16>) outs(%28 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%29 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%30 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%extracted_slice_6 = tensor.extract_slice %arg11[0, 0, 0, 0, %29, %30] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> to tensor<1x1x1x1x4x1xf32>
%31 = iree_gpu.multi_mma %transposed_3, %transposed_5, %extracted_slice_6 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %31 into %arg11[0, 0, 0, 0, %29, %30] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x1xf32> into tensor<1x1x1x1x16x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[%arg5, 0, %arg7, 0, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> into tensor<2x1x2x1x16x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
scf.yield %21 : tensor<2x1x2x1x16x16xf32>
}
%12 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%11 : tensor<2x1x2x1x16x16xf32>) outs(%12 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
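// Note: CSE merges the repeated lane-index computations inside the lane forall; the `d0 mod 16` and `(d0 floordiv 16) * 4 - ...` affine.apply results are now computed once (%23, %24) and shared by the A, B, and accumulator extract/insert slices.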
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%11 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %10) -> (tensor<2x1x2x1x16x16xf32>) {
%17 = tensor.empty() : tensor<2x1x32x16xf16>
%18 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 32, 4) shared_outs(%arg9 = %17) -> (tensor<2x1x32x16xf16>) {
%22 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg8)
%23 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%24 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%23)
%25 = arith.cmpi eq, %24, %c0 : index
%26 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%27 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %25 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg3, %arg8)
%32 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = arith.ori %33, %29 : i1
%35 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%32)
%36 = scf.if %34 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_2 = tensor.extract_slice %3[%23, 0, 0, 0] [%24, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%38 = tensor.empty(%24, %27, %32) : tensor<?x1x?x?xf16>
%39 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %26] * [17, 1] k_offset = [%31] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_2 : tensor<?x35x35x1281xf16>) outs(%38 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %39 low[0, 0, 0, 0] high[0, 0, %30, %35] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %22] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%37 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%36 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %37 into %arg9[%arg5, 0, %arg7, %22] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%19 = tensor.empty() : tensor<16x16xf16>
%20 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %19) -> (tensor<16x16xf16>) {
%22 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%23 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%23)
%26 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%arg6, %arg1)
%27 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %24 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = scf.if %29 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%33 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%26]
%extracted_slice_2 = tensor.extract_slice %5[%22, %33] [%23, %27] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_2 low[0, 0] high[%25, %30] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%32 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%31 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %32 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded_1 = tensor.expand_shape %20 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%21 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 2, 1) shared_outs(%arg9 = %arg4) -> (tensor<2x1x2x1x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, 0, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x1x16x16xf32> to tensor<1x1x1x1x16x16xf32>
%22 = scf.forall (%arg10) in (64) shared_outs(%arg11 = %extracted_slice) -> (tensor<1x1x1x1x16x16xf32>) {
%23 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%extracted_slice_2 = tensor.extract_slice %expanded[%arg5, 0, %arg7, %23, 0, %24] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%25 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%25 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_4 = tensor.extract_slice %expanded_1[0, %24, 0, %23] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%26 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_5 = linalg.transpose ins(%extracted_slice_4 : tensor<1x4x1x1xf16>) outs(%26 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%extracted_slice_6 = tensor.extract_slice %arg11[0, 0, 0, 0, %24, %23] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> to tensor<1x1x1x1x4x1xf32>
%27 = iree_gpu.multi_mma %transposed_3, %transposed_5, %extracted_slice_6 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %27 into %arg11[0, 0, 0, 0, %24, %23] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x1xf32> into tensor<1x1x1x1x16x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[%arg5, 0, %arg7, 0, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> into tensor<2x1x2x1x16x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
scf.yield %21 : tensor<2x1x2x1x16x16xf32>
}
%12 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%11 : tensor<2x1x2x1x16x16xf32>) outs(%12 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After IREELoopInvariantCodeMotionPass (iree-loop-invariant-code-motion) //----- //
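// Note: loop-invariant code motion hoists the tensor.empty ops that seed the staging tensors (tensor<2x1x32x16xf16> and tensor<16x16xf16>) out of the 721-iteration reduction loop; they are now %11 and %12, defined before the scf.for instead of being recreated every K step.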
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%11 = tensor.empty() : tensor<2x1x32x16xf16>
%12 = tensor.empty() : tensor<16x16xf16>
%13 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %10) -> (tensor<2x1x2x1x16x16xf32>) {
%19 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 32, 4) shared_outs(%arg9 = %11) -> (tensor<2x1x32x16xf16>) {
%22 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg8)
%23 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%24 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%23)
%25 = arith.cmpi eq, %24, %c0 : index
%26 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%27 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %25 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg3, %arg8)
%32 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = arith.ori %33, %29 : i1
%35 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%32)
%36 = scf.if %34 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_2 = tensor.extract_slice %3[%23, 0, 0, 0] [%24, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%38 = tensor.empty(%24, %27, %32) : tensor<?x1x?x?xf16>
%39 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %26] * [17, 1] k_offset = [%31] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_2 : tensor<?x35x35x1281xf16>) outs(%38 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %39 low[0, 0, 0, 0] high[0, 0, %30, %35] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %22] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%37 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%36 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %37 into %arg9[%arg5, 0, %arg7, %22] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded = tensor.expand_shape %19 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%20 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %12) -> (tensor<16x16xf16>) {
%22 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%23 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%23)
%26 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%arg6, %arg1)
%27 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %24 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = scf.if %29 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%33 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%26]
%extracted_slice_2 = tensor.extract_slice %5[%22, %33] [%23, %27] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_2 low[0, 0] high[%25, %30] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%32 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%31 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %32 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded_1 = tensor.expand_shape %20 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%21 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 2, 1) shared_outs(%arg9 = %arg4) -> (tensor<2x1x2x1x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, 0, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x1x16x16xf32> to tensor<1x1x1x1x16x16xf32>
%22 = scf.forall (%arg10) in (64) shared_outs(%arg11 = %extracted_slice) -> (tensor<1x1x1x1x16x16xf32>) {
%23 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%extracted_slice_2 = tensor.extract_slice %expanded[%arg5, 0, %arg7, %23, 0, %24] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%25 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%25 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_4 = tensor.extract_slice %expanded_1[0, %24, 0, %23] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%26 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_5 = linalg.transpose ins(%extracted_slice_4 : tensor<1x4x1x1xf16>) outs(%26 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%extracted_slice_6 = tensor.extract_slice %arg11[0, 0, 0, 0, %24, %23] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> to tensor<1x1x1x1x4x1xf32>
%27 = iree_gpu.multi_mma %transposed_3, %transposed_5, %extracted_slice_6 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %27 into %arg11[0, 0, 0, 0, %24, %23] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x1xf32> into tensor<1x1x1x1x16x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[%arg5, 0, %arg7, 0, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> into tensor<2x1x2x1x16x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
scf.yield %21 : tensor<2x1x2x1x16x16xf32>
}
%14 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%13 : tensor<2x1x2x1x16x16xf32>) outs(%14 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%16 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%17 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%18 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %17) shared_outs(%arg7 = %16) -> (tensor<2x1x17x?xf32>) {
%19 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%20 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %19] [1, 1, 1, %20] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %19] [1, 1, 1, %20] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %21 into %arg7[%arg3, 0, %arg5, %19] [1, 1, 1, %20] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
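// [annotation, not part of the compiler dump] This pass folds and hoists redundant tensor.insert_slice / tensor.extract_slice chains. Judging from the tail of the previous dump above, the function body appears largely unchanged at this point; the loop nest and the multi_mma inner tile are the same.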
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = tensor.empty() : tensor<2x1x2x1x16x16xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<2x1x2x1x16x16xf32>) -> tensor<2x1x2x1x16x16xf32>
%11 = tensor.empty() : tensor<2x1x32x16xf16>
%12 = tensor.empty() : tensor<16x16xf16>
%13 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %10) -> (tensor<2x1x2x1x16x16xf32>) {
%19 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 32, 4) shared_outs(%arg9 = %11) -> (tensor<2x1x32x16xf16>) {
%22 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg8)
%23 = affine.min affine_map<(d0) -> (2, d0)>(%arg5)
%24 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%23)
%25 = arith.cmpi eq, %24, %c0 : index
%26 = affine.min affine_map<(d0) -> (17, d0)>(%arg7)
%27 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %25 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg3, %arg8)
%32 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = arith.ori %33, %29 : i1
%35 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%32)
%36 = scf.if %34 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_2 = tensor.extract_slice %3[%23, 0, 0, 0] [%24, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%38 = tensor.empty(%24, %27, %32) : tensor<?x1x?x?xf16>
%39 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %26] * [17, 1] k_offset = [%31] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_2 : tensor<?x35x35x1281xf16>) outs(%38 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %39 low[0, 0, 0, 0] high[0, 0, %30, %35] {
^bb0(%arg10: index, %arg11: index, %arg12: index, %arg13: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, %22] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%37 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%36 : tensor<1x1x1x4xf16>) outs(%extracted_slice : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %37 into %arg9[%arg5, 0, %arg7, %22] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded = tensor.expand_shape %19 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%20 = scf.forall (%arg5, %arg6) in (16, 16) shared_outs(%arg7 = %12) -> (tensor<16x16xf16>) {
%extracted_slice = tensor.extract_slice %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%22 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%arg5]
%23 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%23)
%26 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%arg6, %arg1)
%27 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %24 : i1
%30 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%27)
%31 = scf.if %29 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%33 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%26]
%extracted_slice_2 = tensor.extract_slice %5[%22, %33] [%23, %27] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_2 low[0, 0] high[%25, %30] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%32 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%31 : tensor<1x1xf16>) outs(%extracted_slice : tensor<1x1xf16>) -> tensor<1x1xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %32 into %arg7[%arg5, %arg6] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%expanded_1 = tensor.expand_shape %20 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%21 = scf.forall (%arg5, %arg6, %arg7, %arg8) in (2, 1, 2, 1) shared_outs(%arg9 = %arg4) -> (tensor<2x1x2x1x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg9[%arg5, 0, %arg7, 0, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x1x16x16xf32> to tensor<1x1x1x1x16x16xf32>
%22 = scf.forall (%arg10) in (64) shared_outs(%arg11 = %extracted_slice) -> (tensor<1x1x1x1x16x16xf32>) {
%23 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg10)
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg10)
%extracted_slice_2 = tensor.extract_slice %expanded[%arg5, 0, %arg7, %23, 0, %24] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%25 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%25 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%extracted_slice_4 = tensor.extract_slice %expanded_1[0, %24, 0, %23] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%26 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_5 = linalg.transpose ins(%extracted_slice_4 : tensor<1x4x1x1xf16>) outs(%26 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%extracted_slice_6 = tensor.extract_slice %arg11[0, 0, 0, 0, %24, %23] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> to tensor<1x1x1x1x4x1xf32>
%27 = iree_gpu.multi_mma %transposed_3, %transposed_5, %extracted_slice_6 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %27 into %arg11[0, 0, 0, 0, %24, %23] [1, 1, 1, 1, 4, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x1xf32> into tensor<1x1x1x1x16x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[%arg5, 0, %arg7, 0, 0, 0] [1, 1, 1, 1, 16, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x16x16xf32> into tensor<2x1x2x1x16x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
scf.yield %21 : tensor<2x1x2x1x16x16xf32>
}
%14 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%transposed = linalg.transpose ins(%13 : tensor<2x1x2x1x16x16xf32>) outs(%14 : tensor<2x1x2x16x1x16xf32>) permutation = [0, 1, 2, 4, 3, 5]
%15 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %transposed [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%16 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%17 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%18 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %17) shared_outs(%arg7 = %16) -> (tensor<2x1x17x?xf32>) {
%19 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%20 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %19] [1, 1, 1, %20] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %19] [1, 1, 1, %20] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %21 into %arg7[%arg3, 0, %arg5, %19] [1, 1, 1, %20] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After GPUFuseAndHoistParallelLoopsPass (iree-codegen-gpu-fuse-and-hoist-parallel-loops) //----- //
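// [annotation, not part of the compiler dump] Relative to the previous dump, the subgroup/lane scf.forall loops are hoisted above the K loop (scf.for %c0 to %c721), so each lane now carries its 1x1x1x1x4x1 accumulator across K iterations instead of re-tiling per step. The cooperative loads of the 2x1x32x16 and 16x16 tiles are rewritten as iree_gpu.barrier_region ops over bufferization.alloc_tensor allocations in #gpu.address_space<workgroup>, with the per-thread work expressed as single-trip scf.for loops tagged {unroll_loop}.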
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c256 = arith.constant 256 : index
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25 = iree_gpu.barrier_region ins(%9 : tensor<2x1x32x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>):
%30 = scf.for %arg13 = %c0 to %c256 step %c256 iter_args(%arg14 = %arg12) -> (tensor<2x1x32x16xf16>) {
%31 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 + d1 + d2 * 64 + d3 * 64 + d4 * 128 + d5 * 128)>(%arg13, %arg8, %c0, %arg5, %arg3, %c0)
%32:4 = affine.delinearize_index %31 into (2, 1, 32, 4) : index, index, index, index
%33 = affine.apply affine_map<(d0) -> (d0 * 4)>(%32#3)
%34 = affine.min affine_map<(d0) -> (2, d0)>(%32#0)
%35 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = affine.min affine_map<(d0) -> (17, d0)>(%32#2)
%38 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %36 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %32#3)
%43 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%42)
%44 = arith.cmpi eq, %43, %c0 : index
%45 = arith.ori %44, %40 : i1
%46 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%43)
%47 = scf.if %45 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg15: index, %arg16: index, %arg17: index, %arg18: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_8 = tensor.extract_slice %3[%34, 0, 0, 0] [%35, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%49 = tensor.empty(%35, %38, %43) : tensor<?x1x?x?xf16>
%50 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %37] * [17, 1] k_offset = [%42] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_8 : tensor<?x35x35x1281xf16>) outs(%49 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %50 low[0, 0, 0, 0] high[0, 0, %41, %46] {
^bb0(%arg15: index, %arg16: index, %arg17: index, %arg18: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg14[%32#0, 0, %32#2, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%48 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%47 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %48 into %arg14[%32#0, 0, %32#2, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
scf.yield %inserted_slice : tensor<2x1x32x16xf16>
} {unroll_loop}
iree_gpu.yield %30 : tensor<2x1x32x16xf16>
} : tensor<2x1x32x16xf16>
%26 = iree_gpu.barrier_region ins(%10 : tensor<16x16xf16>) {
^bb0(%arg12: tensor<16x16xf16>):
%30 = scf.for %arg13 = %c0 to %c256 step %c256 iter_args(%arg14 = %arg12) -> (tensor<16x16xf16>) {
%31 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 + d1 + d2 * 64 + d3 * 64 + d4 * 128 + d5 * 128)>(%arg13, %arg8, %c0, %arg5, %arg3, %c0)
%32:2 = affine.delinearize_index %31 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg14[%32#0, %32#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%33 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%32#0]
%34 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%34)
%37 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%32#1, %arg1)
%38 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %35 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = scf.if %40 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%44 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%37]
%extracted_slice_8 = tensor.extract_slice %5[%33, %44] [%34, %38] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_8 low[0, 0] high[%36, %41] {
^bb0(%arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%43 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%42 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice = tensor.insert_slice %43 into %arg14[%32#0, %32#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
scf.yield %inserted_slice : tensor<16x16xf16>
} {unroll_loop}
iree_gpu.yield %30 : tensor<16x16xf16>
} : tensor<16x16xf16>
%expanded = tensor.expand_shape %25 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%27 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%27 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %26 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%28 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%28 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%29 = iree_gpu.multi_mma %transposed_3, %transposed_6, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.yield %29 : tensor<1x1x1x1x4x1xf32>
}
%23 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%24 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %23, 0, %24] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%22 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %23, 0, %24] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After GPUGreedilyDistributeToThreadsPass (iree-codegen-gpu-greedily-distribute-to-threads) //----- //
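// [annotation, not part of the compiler dump] No visible change relative to the previous dump; the remaining ops already appear to be distributed to threads, so this pass is a no-op here.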
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c256 = arith.constant 256 : index
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25 = iree_gpu.barrier_region ins(%9 : tensor<2x1x32x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>):
%30 = scf.for %arg13 = %c0 to %c256 step %c256 iter_args(%arg14 = %arg12) -> (tensor<2x1x32x16xf16>) {
%31 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 + d1 + d2 * 64 + d3 * 64 + d4 * 128 + d5 * 128)>(%arg13, %arg8, %c0, %arg5, %arg3, %c0)
%32:4 = affine.delinearize_index %31 into (2, 1, 32, 4) : index, index, index, index
%33 = affine.apply affine_map<(d0) -> (d0 * 4)>(%32#3)
%34 = affine.min affine_map<(d0) -> (2, d0)>(%32#0)
%35 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = affine.min affine_map<(d0) -> (17, d0)>(%32#2)
%38 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %36 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %32#3)
%43 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%42)
%44 = arith.cmpi eq, %43, %c0 : index
%45 = arith.ori %44, %40 : i1
%46 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%43)
%47 = scf.if %45 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg15: index, %arg16: index, %arg17: index, %arg18: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_8 = tensor.extract_slice %3[%34, 0, 0, 0] [%35, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%49 = tensor.empty(%35, %38, %43) : tensor<?x1x?x?xf16>
%50 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %37] * [17, 1] k_offset = [%42] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_8 : tensor<?x35x35x1281xf16>) outs(%49 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %50 low[0, 0, 0, 0] high[0, 0, %41, %46] {
^bb0(%arg15: index, %arg16: index, %arg17: index, %arg18: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg14[%32#0, 0, %32#2, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%48 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%47 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %48 into %arg14[%32#0, 0, %32#2, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
scf.yield %inserted_slice : tensor<2x1x32x16xf16>
} {unroll_loop}
iree_gpu.yield %30 : tensor<2x1x32x16xf16>
} : tensor<2x1x32x16xf16>
%26 = iree_gpu.barrier_region ins(%10 : tensor<16x16xf16>) {
^bb0(%arg12: tensor<16x16xf16>):
%30 = scf.for %arg13 = %c0 to %c256 step %c256 iter_args(%arg14 = %arg12) -> (tensor<16x16xf16>) {
%31 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 + d1 + d2 * 64 + d3 * 64 + d4 * 128 + d5 * 128)>(%arg13, %arg8, %c0, %arg5, %arg3, %c0)
%32:2 = affine.delinearize_index %31 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg14[%32#0, %32#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%33 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%32#0]
%34 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%34)
%37 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%32#1, %arg1)
%38 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %35 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = scf.if %40 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%44 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%37]
%extracted_slice_8 = tensor.extract_slice %5[%33, %44] [%34, %38] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_8 low[0, 0] high[%36, %41] {
^bb0(%arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%43 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%42 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice = tensor.insert_slice %43 into %arg14[%32#0, %32#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
scf.yield %inserted_slice : tensor<16x16xf16>
} {unroll_loop}
iree_gpu.yield %30 : tensor<16x16xf16>
} : tensor<16x16xf16>
%expanded = tensor.expand_shape %25 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%27 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%27 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %26 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%28 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%28 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%29 = iree_gpu.multi_mma %transposed_3, %transposed_6, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.yield %29 : tensor<1x1x1x1x4x1xf32>
}
%23 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%24 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %23, 0, %24] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%22 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %23, 0, %24] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x?xf32>) outs(%extracted_slice_1 : tensor<1x1x1x?xf32>) -> tensor<1x1x1x?xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After TileLargeTensorsPass (iree-codegen-tile-large-tensors) //----- //
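// [annotation, not part of the compiler dump] The only change relative to the previous dump is at the end of the function: the thread-level linalg.copy of the dynamic tensor<1x1x1x?xf32> output slice is tiled into nested unit-step scf.for loops that copy 1x1x1x1 slices. The redundant arith.constant ops introduced by that tiling are cleaned up by the canonicalizer in the next dump.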
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c256 = arith.constant 256 : index
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25 = iree_gpu.barrier_region ins(%9 : tensor<2x1x32x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>):
%30 = scf.for %arg13 = %c0 to %c256 step %c256 iter_args(%arg14 = %arg12) -> (tensor<2x1x32x16xf16>) {
%31 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 + d1 + d2 * 64 + d3 * 64 + d4 * 128 + d5 * 128)>(%arg13, %arg8, %c0, %arg5, %arg3, %c0)
%32:4 = affine.delinearize_index %31 into (2, 1, 32, 4) : index, index, index, index
%33 = affine.apply affine_map<(d0) -> (d0 * 4)>(%32#3)
%34 = affine.min affine_map<(d0) -> (2, d0)>(%32#0)
%35 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = affine.min affine_map<(d0) -> (17, d0)>(%32#2)
%38 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %36 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %32#3)
%43 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%42)
%44 = arith.cmpi eq, %43, %c0 : index
%45 = arith.ori %44, %40 : i1
%46 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%43)
%47 = scf.if %45 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg15: index, %arg16: index, %arg17: index, %arg18: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_8 = tensor.extract_slice %3[%34, 0, 0, 0] [%35, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%49 = tensor.empty(%35, %38, %43) : tensor<?x1x?x?xf16>
%50 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %37] * [17, 1] k_offset = [%42] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_8 : tensor<?x35x35x1281xf16>) outs(%49 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %50 low[0, 0, 0, 0] high[0, 0, %41, %46] {
^bb0(%arg15: index, %arg16: index, %arg17: index, %arg18: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg14[%32#0, 0, %32#2, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%48 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%47 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %48 into %arg14[%32#0, 0, %32#2, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
scf.yield %inserted_slice : tensor<2x1x32x16xf16>
} {unroll_loop}
iree_gpu.yield %30 : tensor<2x1x32x16xf16>
} : tensor<2x1x32x16xf16>
%26 = iree_gpu.barrier_region ins(%10 : tensor<16x16xf16>) {
^bb0(%arg12: tensor<16x16xf16>):
%30 = scf.for %arg13 = %c0 to %c256 step %c256 iter_args(%arg14 = %arg12) -> (tensor<16x16xf16>) {
%31 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 + d1 + d2 * 64 + d3 * 64 + d4 * 128 + d5 * 128)>(%arg13, %arg8, %c0, %arg5, %arg3, %c0)
%32:2 = affine.delinearize_index %31 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg14[%32#0, %32#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%33 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%32#0]
%34 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%34)
%37 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%32#1, %arg1)
%38 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %35 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = scf.if %40 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%44 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%37]
%extracted_slice_8 = tensor.extract_slice %5[%33, %44] [%34, %38] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_8 low[0, 0] high[%36, %41] {
^bb0(%arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%43 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%42 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice = tensor.insert_slice %43 into %arg14[%32#0, %32#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
scf.yield %inserted_slice : tensor<16x16xf16>
} {unroll_loop}
iree_gpu.yield %30 : tensor<16x16xf16>
} : tensor<16x16xf16>
%expanded = tensor.expand_shape %25 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%27 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%27 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %26 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%28 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%28 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%29 = iree_gpu.multi_mma %transposed_3, %transposed_6, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.yield %29 : tensor<1x1x1x1x4x1xf32>
}
%23 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%24 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %23, 0, %24] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%22 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %23, 0, %24] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%c3 = arith.constant 3 : index
%c3_2 = arith.constant 3 : index
%c0_3 = arith.constant 0 : index
%c0_4 = arith.constant 0 : index
%c0_5 = arith.constant 0 : index
%c0_6 = arith.constant 0 : index
%c1_7 = arith.constant 1 : index
%c1_8 = arith.constant 1 : index
%c1_9 = arith.constant 1 : index
%c1_10 = arith.constant 1 : index
%c1_11 = arith.constant 1 : index
%c1_12 = arith.constant 1 : index
%c1_13 = arith.constant 1 : index
%19 = scf.for %arg8 = %c0_3 to %c1_7 step %c1_10 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%20 = scf.for %arg10 = %c0_4 to %c1_8 step %c1_11 iter_args(%arg11 = %arg9) -> (tensor<1x1x1x?xf32>) {
%21 = scf.for %arg12 = %c0_5 to %c1_9 step %c1_12 iter_args(%arg13 = %arg11) -> (tensor<1x1x1x?xf32>) {
%22 = scf.for %arg14 = %c0_6 to %18 step %c1_13 iter_args(%arg15 = %arg13) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_14 = tensor.extract_slice %extracted_slice[%arg8, %arg10, %arg12, %arg14] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_15 = tensor.extract_slice %arg15[%arg8, %arg10, %arg12, %arg14] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%23 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_14 : tensor<1x1x1x1xf32>) outs(%extracted_slice_15 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %23 into %arg15[%arg8, %arg10, %arg12, %arg14] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.yield %22 : tensor<1x1x1x?xf32>
}
scf.yield %21 : tensor<1x1x1x?xf32>
}
scf.yield %20 : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
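// [annotation, not part of the compiler dump] Canonicalization drops the dead constants, folds the single-iteration scf.for loops inside the iree_gpu.barrier_region bodies (step 256 over a 256-wide range) into straight-line code, and simplifies the thread-index affine maps and affine.delinearize_index shapes (e.g. (2, 1, 32, 4) becomes (2, 32, 4)).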
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25 = iree_gpu.barrier_region ins(%9 : tensor<2x1x32x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>):
%30 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%31:3 = affine.delinearize_index %30 into (2, 32, 4) : index, index, index
%32 = affine.apply affine_map<(d0) -> (d0 * 4)>(%31#2)
%33 = affine.min affine_map<(d0) -> (2, d0)>(%31#0)
%34 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = affine.min affine_map<(d0) -> (17, d0)>(%31#1)
%37 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%36)
%38 = arith.cmpi eq, %37, %c0 : index
%39 = arith.ori %38, %35 : i1
%40 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%37)
%41 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %31#2)
%42 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%41)
%43 = arith.cmpi eq, %42, %c0 : index
%44 = arith.ori %43, %39 : i1
%45 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%42)
%46 = scf.if %44 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg13: index, %arg14: index, %arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_8 = tensor.extract_slice %3[%33, 0, 0, 0] [%34, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%48 = tensor.empty(%34, %37, %42) : tensor<?x1x?x?xf16>
%49 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %36] * [17, 1] k_offset = [%41] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_8 : tensor<?x35x35x1281xf16>) outs(%48 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %49 low[0, 0, 0, 0] high[0, 0, %40, %45] {
^bb0(%arg13: index, %arg14: index, %arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg12[%31#0, 0, %31#1, %32] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%47 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%46 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %47 into %arg12[%31#0, 0, %31#1, %32] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
iree_gpu.yield %inserted_slice : tensor<2x1x32x16xf16>
} : tensor<2x1x32x16xf16>
%26 = iree_gpu.barrier_region ins(%10 : tensor<16x16xf16>) {
^bb0(%arg12: tensor<16x16xf16>):
%30 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%31:2 = affine.delinearize_index %30 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg12[%31#0, %31#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%32 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%31#0]
%33 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%32)
%34 = arith.cmpi eq, %33, %c0 : index
%35 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%33)
%36 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%31#1, %arg1)
%37 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %36)
%38 = arith.cmpi eq, %37, %c0 : index
%39 = arith.ori %38, %34 : i1
%40 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%37)
%41 = scf.if %39 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg13: index, %arg14: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%43 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%36]
%extracted_slice_8 = tensor.extract_slice %5[%32, %43] [%33, %37] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_8 low[0, 0] high[%35, %40] {
^bb0(%arg13: index, %arg14: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%42 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%41 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice = tensor.insert_slice %42 into %arg12[%31#0, %31#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice : tensor<16x16xf16>
} : tensor<16x16xf16>
%expanded = tensor.expand_shape %25 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%27 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%27 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %26 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%28 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%28 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%29 = iree_gpu.multi_mma %transposed_3, %transposed_6, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.yield %29 : tensor<1x1x1x1x4x1xf32>
}
%23 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%24 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %23, 0, %24] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%22 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %23, 0, %24] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
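// Note (annotation, not compiler output): compared with the preceding dump, CSE reuses the
// existing lane-id index computations (%18 = d0 mod 16, %19 = the floordiv-based offset) for the
// epilogue extract_slice / parallel_insert_slice instead of recomputing the same affine.apply
// ops after the scf.for reduction loop; the rest of the kernel is unchanged.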
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%23 = iree_gpu.barrier_region ins(%9 : tensor<2x1x32x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>):
%28 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%29:3 = affine.delinearize_index %28 into (2, 32, 4) : index, index, index
%30 = affine.apply affine_map<(d0) -> (d0 * 4)>(%29#2)
%31 = affine.min affine_map<(d0) -> (2, d0)>(%29#0)
%32 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = affine.min affine_map<(d0) -> (17, d0)>(%29#1)
%35 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %33 : i1
%38 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%35)
%39 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %29#2)
%40 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%39)
%41 = arith.cmpi eq, %40, %c0 : index
%42 = arith.ori %41, %37 : i1
%43 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%40)
%44 = scf.if %42 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg13: index, %arg14: index, %arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_8 = tensor.extract_slice %3[%31, 0, 0, 0] [%32, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%46 = tensor.empty(%32, %35, %40) : tensor<?x1x?x?xf16>
%47 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %34] * [17, 1] k_offset = [%39] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_8 : tensor<?x35x35x1281xf16>) outs(%46 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %47 low[0, 0, 0, 0] high[0, 0, %38, %43] {
^bb0(%arg13: index, %arg14: index, %arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg12[%29#0, 0, %29#1, %30] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%45 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%44 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %45 into %arg12[%29#0, 0, %29#1, %30] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
iree_gpu.yield %inserted_slice : tensor<2x1x32x16xf16>
} : tensor<2x1x32x16xf16>
%24 = iree_gpu.barrier_region ins(%10 : tensor<16x16xf16>) {
^bb0(%arg12: tensor<16x16xf16>):
%28 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%29:2 = affine.delinearize_index %28 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg12[%29#0, %29#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%30 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%29#0]
%31 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%30)
%32 = arith.cmpi eq, %31, %c0 : index
%33 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%31)
%34 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%29#1, %arg1)
%35 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %32 : i1
%38 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%35)
%39 = scf.if %37 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg13: index, %arg14: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%41 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%34]
%extracted_slice_8 = tensor.extract_slice %5[%30, %41] [%31, %35] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_8 low[0, 0] high[%33, %38] {
^bb0(%arg13: index, %arg14: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%40 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%39 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice = tensor.insert_slice %40 into %arg12[%29#0, %29#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice : tensor<16x16xf16>
} : tensor<16x16xf16>
%expanded = tensor.expand_shape %23 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%25 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%25 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %24 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%26 = tensor.empty() : tensor<1x1x1x4xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%26 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%27 = iree_gpu.multi_mma %transposed_3, %transposed_6, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.yield %27 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%22 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After IREELoopInvariantCodeMotionPass (iree-loop-invariant-code-motion) //----- //
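// Note (annotation, not compiler output): loop-invariant code motion hoists the two tensor.empty
// ops used as transpose init operands (tensor<1x1x1x1x1x4xf16> and tensor<1x1x1x4xf16>) out of the
// scf.for reduction loop over the K tiles (%c0 to %c721 step %c1); the loop body is otherwise
// unchanged.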
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%23 = tensor.empty() : tensor<1x1x1x4xf16>
%24 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25 = iree_gpu.barrier_region ins(%9 : tensor<2x1x32x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>):
%28 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%29:3 = affine.delinearize_index %28 into (2, 32, 4) : index, index, index
%30 = affine.apply affine_map<(d0) -> (d0 * 4)>(%29#2)
%31 = affine.min affine_map<(d0) -> (2, d0)>(%29#0)
%32 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = affine.min affine_map<(d0) -> (17, d0)>(%29#1)
%35 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %33 : i1
%38 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%35)
%39 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %29#2)
%40 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%39)
%41 = arith.cmpi eq, %40, %c0 : index
%42 = arith.ori %41, %37 : i1
%43 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%40)
%44 = scf.if %42 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg13: index, %arg14: index, %arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_8 = tensor.extract_slice %3[%31, 0, 0, 0] [%32, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%46 = tensor.empty(%32, %35, %40) : tensor<?x1x?x?xf16>
%47 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %34] * [17, 1] k_offset = [%39] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_8 : tensor<?x35x35x1281xf16>) outs(%46 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %47 low[0, 0, 0, 0] high[0, 0, %38, %43] {
^bb0(%arg13: index, %arg14: index, %arg15: index, %arg16: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg12[%29#0, 0, %29#1, %30] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%45 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%44 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %45 into %arg12[%29#0, 0, %29#1, %30] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
iree_gpu.yield %inserted_slice : tensor<2x1x32x16xf16>
} : tensor<2x1x32x16xf16>
%26 = iree_gpu.barrier_region ins(%10 : tensor<16x16xf16>) {
^bb0(%arg12: tensor<16x16xf16>):
%28 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%29:2 = affine.delinearize_index %28 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg12[%29#0, %29#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%30 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%29#0]
%31 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%30)
%32 = arith.cmpi eq, %31, %c0 : index
%33 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%31)
%34 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%29#1, %arg1)
%35 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %32 : i1
%38 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%35)
%39 = scf.if %37 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg13: index, %arg14: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%41 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%34]
%extracted_slice_8 = tensor.extract_slice %5[%30, %41] [%31, %35] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_8 low[0, 0] high[%33, %38] {
^bb0(%arg13: index, %arg14: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%40 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%39 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice = tensor.insert_slice %40 into %arg12[%29#0, %29#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice : tensor<16x16xf16>
} : tensor<16x16xf16>
%expanded = tensor.expand_shape %25 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%22 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %26 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%23 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%27 = iree_gpu.multi_mma %transposed_3, %transposed_6, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.yield %27 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%24 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CombineBarrierRegionsPass (iree-gpu-combine-barrier-regions) //----- //
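// Note (annotation, not compiler output): the two separate iree_gpu.barrier_region ops that staged
// the im2col input tile (tensor<2x1x32x16xf16>) and the filter tile (tensor<16x16xf16>) into
// workgroup memory are merged into a single barrier_region with two operands and two results, so
// one barrier now covers both shared-memory copies per K iteration.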
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%23 = tensor.empty() : tensor<1x1x1x4xf16>
%24 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%27 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%28:3 = affine.delinearize_index %27 into (2, 32, 4) : index, index, index
%29 = affine.apply affine_map<(d0) -> (d0 * 4)>(%28#2)
%30 = affine.min affine_map<(d0) -> (2, d0)>(%28#0)
%31 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%30)
%32 = arith.cmpi eq, %31, %c0 : index
%33 = affine.min affine_map<(d0) -> (17, d0)>(%28#1)
%34 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = arith.ori %35, %32 : i1
%37 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%34)
%38 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %28#2)
%39 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%38)
%40 = arith.cmpi eq, %39, %c0 : index
%41 = arith.ori %40, %36 : i1
%42 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%39)
%43 = scf.if %41 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_10 = tensor.extract_slice %3[%30, 0, 0, 0] [%31, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%58 = tensor.empty(%31, %34, %39) : tensor<?x1x?x?xf16>
%59 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %33] * [17, 1] k_offset = [%38] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_10 : tensor<?x35x35x1281xf16>) outs(%58 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %59 low[0, 0, 0, 0] high[0, 0, %37, %42] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg12[%28#0, 0, %28#1, %29] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%44 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%43 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %44 into %arg12[%28#0, 0, %28#1, %29] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%45 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%46:2 = affine.delinearize_index %45 into (16, 16) : index, index
%extracted_slice_8 = tensor.extract_slice %arg13[%46#0, %46#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%47 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%46#0]
%48 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%47)
%49 = arith.cmpi eq, %48, %c0 : index
%50 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%48)
%51 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%46#1, %arg1)
%52 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %51)
%53 = arith.cmpi eq, %52, %c0 : index
%54 = arith.ori %53, %49 : i1
%55 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%52)
%56 = scf.if %54 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%58 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%51]
%extracted_slice_10 = tensor.extract_slice %5[%47, %58] [%48, %52] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_10 low[0, 0] high[%50, %55] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%57 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%56 : tensor<1x1xf16>) outs(%extracted_slice_8 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_9 = tensor.insert_slice %57 into %arg13[%46#0, %46#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_9 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %25#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%22 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %25#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%23 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%26 = iree_gpu.multi_mma %transposed_3, %transposed_6, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, padding = [2, 1, 32, 16, 16], promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 1], subgroup = [1, 1, 1, 1, 0], workgroup = [2, 1, 32, 16, 0]}>} : tensor<1x1x1x1x1x4xf16>, tensor<1x1x1x4xf16> into tensor<1x1x1x1x4x1xf32>
scf.yield %26 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%24 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After VectorizeIREEGPUOpsPass (iree-gpu-vectorize-ops) //----- //
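// Note (annotation, not compiler output): the iree_gpu.multi_mma contraction now operates on
// vectors inside the K loop: its operands are loaded with vector.transfer_read from the transposed
// tiles and the accumulator tensor, and the vector<1x1x1x1x4x1xf32> result is written back with
// vector.transfer_write; the tensor-level lowering_config attribute is dropped from the op.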
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%23 = tensor.empty() : tensor<1x1x1x4xf16>
%24 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%31 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%32:3 = affine.delinearize_index %31 into (2, 32, 4) : index, index, index
%33 = affine.apply affine_map<(d0) -> (d0 * 4)>(%32#2)
%34 = affine.min affine_map<(d0) -> (2, d0)>(%32#0)
%35 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = affine.min affine_map<(d0) -> (17, d0)>(%32#1)
%38 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %36 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %32#2)
%43 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%42)
%44 = arith.cmpi eq, %43, %c0 : index
%45 = arith.ori %44, %40 : i1
%46 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%43)
%47 = scf.if %45 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_10 = tensor.extract_slice %3[%34, 0, 0, 0] [%35, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%62 = tensor.empty(%35, %38, %43) : tensor<?x1x?x?xf16>
%63 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %37] * [17, 1] k_offset = [%42] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_10 : tensor<?x35x35x1281xf16>) outs(%62 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %63 low[0, 0, 0, 0] high[0, 0, %41, %46] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg12[%32#0, 0, %32#1, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%48 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%47 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %48 into %arg12[%32#0, 0, %32#1, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%49 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%50:2 = affine.delinearize_index %49 into (16, 16) : index, index
%extracted_slice_8 = tensor.extract_slice %arg13[%50#0, %50#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%51 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%50#0]
%52 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%51)
%53 = arith.cmpi eq, %52, %c0 : index
%54 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%52)
%55 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%50#1, %arg1)
%56 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %55)
%57 = arith.cmpi eq, %56, %c0 : index
%58 = arith.ori %57, %53 : i1
%59 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%56)
%60 = scf.if %58 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%62 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%55]
%extracted_slice_10 = tensor.extract_slice %5[%51, %62] [%52, %56] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_10 low[0, 0] high[%54, %59] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%61 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%60 : tensor<1x1xf16>) outs(%extracted_slice_8 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_9 = tensor.insert_slice %61 into %arg13[%50#0, %50#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_9 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %25#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%22 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %25#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%23 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%26 = vector.transfer_read %transposed_3[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x1x4xf16>, vector<1x1x1x1x1x4xf16>
%27 = vector.transfer_read %transposed_6[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x1x4xf16>, vector<1x1x1x4xf16>
%28 = vector.transfer_read %arg11[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%29 = iree_gpu.multi_mma %26, %27, %28 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
%30 = vector.transfer_write %29, %arg11[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
scf.yield %30 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%24 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After DecomposeConvolutionToLowerDimOpsPass (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- //
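// Note: the function below appears unchanged from the preceding dump; the
// convolution was already rewritten into iree_linalg_ext.im2col + matmul form
// (use_igemm_convolution = true), so there is no named convolution op left for
// this pass to decompose.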
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%23 = tensor.empty() : tensor<1x1x1x4xf16>
%24 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%31 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%32:3 = affine.delinearize_index %31 into (2, 32, 4) : index, index, index
%33 = affine.apply affine_map<(d0) -> (d0 * 4)>(%32#2)
%34 = affine.min affine_map<(d0) -> (2, d0)>(%32#0)
%35 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = affine.min affine_map<(d0) -> (17, d0)>(%32#1)
%38 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %36 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %32#2)
%43 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%42)
%44 = arith.cmpi eq, %43, %c0 : index
%45 = arith.ori %44, %40 : i1
%46 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%43)
%47 = scf.if %45 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_10 = tensor.extract_slice %3[%34, 0, 0, 0] [%35, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%62 = tensor.empty(%35, %38, %43) : tensor<?x1x?x?xf16>
%63 = iree_linalg_ext.im2col strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%arg0, %37] * [17, 1] k_offset = [%42] * [1] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%extracted_slice_10 : tensor<?x35x35x1281xf16>) outs(%62 : tensor<?x1x?x?xf16>) -> tensor<?x1x?x?xf16>
%padded = tensor.pad %63 low[0, 0, 0, 0] high[0, 0, %41, %46] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg12[%32#0, 0, %32#1, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%48 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%47 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %48 into %arg12[%32#0, 0, %32#1, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%49 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%50:2 = affine.delinearize_index %49 into (16, 16) : index, index
%extracted_slice_8 = tensor.extract_slice %arg13[%50#0, %50#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%51 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%50#0]
%52 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%51)
%53 = arith.cmpi eq, %52, %c0 : index
%54 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%52)
%55 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%50#1, %arg1)
%56 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %55)
%57 = arith.cmpi eq, %56, %c0 : index
%58 = arith.ori %57, %53 : i1
%59 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%56)
%60 = scf.if %58 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%62 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%55]
%extracted_slice_10 = tensor.extract_slice %5[%51, %62] [%52, %56] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_10 low[0, 0] high[%54, %59] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%61 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%60 : tensor<1x1xf16>) outs(%extracted_slice_8 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_9 = tensor.insert_slice %61 into %arg13[%50#0, %50#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_9 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %25#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%22 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %25#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%23 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%26 = vector.transfer_read %transposed_3[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x1x4xf16>, vector<1x1x1x1x1x4xf16>
%27 = vector.transfer_read %transposed_6[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x1x4xf16>, vector<1x1x1x4xf16>
%28 = vector.transfer_read %arg11[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%29 = iree_gpu.multi_mma %26, %27, %28 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
%30 = vector.transfer_write %29, %arg11[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
scf.yield %30 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%24 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After DecomposeIm2colPass (iree-linalg-ext-decompose-im2col) //----- //
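// Note: this pass decomposes the iree_linalg_ext.im2col op from the previous
// dump into a nest of scf.for loops over the dynamic batch, row, and K-tile
// extents. The affine.apply ops in the innermost loop recover the source
// coordinates of the 2x35x35x1281 input from the flattened im2col indices;
// with kernel_size = [3, 3], strides = [2, 2], and 1281 input channels
// (so K = 3 * 3 * 1281 = 11529 and 3843 = 3 * 1281), the maps amount to:
//   c  = k mod 1281
//   kh = k floordiv 3843
//   kw = (k mod 3843) floordiv 1281
//   h  = oh * 2 + kh,   w = ow * 2 + kw     (oh, ow recovered from the m index)
// Each element is then copied individually into the ?x1x?x? gather tile.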
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%23 = tensor.empty() : tensor<1x1x1x4xf16>
%24 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%31 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%32:3 = affine.delinearize_index %31 into (2, 32, 4) : index, index, index
%33 = affine.apply affine_map<(d0) -> (d0 * 4)>(%32#2)
%34 = affine.min affine_map<(d0) -> (2, d0)>(%32#0)
%35 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = affine.min affine_map<(d0) -> (17, d0)>(%32#1)
%38 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %36 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %32#2)
%43 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%42)
%44 = arith.cmpi eq, %43, %c0 : index
%45 = arith.ori %44, %40 : i1
%46 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%43)
%47 = scf.if %45 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_10 = tensor.extract_slice %3[%34, 0, 0, 0] [%35, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%62 = tensor.empty(%35, %38, %43) : tensor<?x1x?x?xf16>
%63 = scf.for %arg14 = %c0 to %35 step %c1 iter_args(%arg15 = %62) -> (tensor<?x1x?x?xf16>) {
%64 = scf.for %arg16 = %c0 to %c1 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%65 = scf.for %arg18 = %c0 to %38 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%66 = scf.for %arg20 = %c0 to %43 step %c1 iter_args(%arg21 = %arg19) -> (tensor<?x1x?x?xf16>) {
%67 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%42, %arg20)
%68 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (((d0 * 17 + d1 + d2 * 17 + d3) floordiv 17) * 2 + (d4 + d5) floordiv 3843)>(%arg16, %arg18, %arg0, %37, %42, %arg20)
%69 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 * 34 + d1 * 2 + d2 * 34 + d3 * 2 - ((d0 * 17 + d1 + d2 * 17 + d3) floordiv 17) * 34 + ((d4 + d5) mod 3843) floordiv 1281)>(%arg16, %arg18, %arg0, %37, %42, %arg20)
%extracted_slice_11 = tensor.extract_slice %extracted_slice_10[%arg14, %68, %69, %67] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_12 = tensor.extract_slice %arg21[%arg14, %arg16, %arg18, %arg20] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%70 = linalg.copy ins(%extracted_slice_11 : tensor<1x1x1x1xf16>) outs(%extracted_slice_12 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_13 = tensor.insert_slice %70 into %arg21[%arg14, %arg16, %arg18, %arg20] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_13 : tensor<?x1x?x?xf16>
}
scf.yield %66 : tensor<?x1x?x?xf16>
}
scf.yield %65 : tensor<?x1x?x?xf16>
}
scf.yield %64 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %63 low[0, 0, 0, 0] high[0, 0, %41, %46] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg12[%32#0, 0, %32#1, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%48 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%47 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %48 into %arg12[%32#0, 0, %32#1, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%49 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%50:2 = affine.delinearize_index %49 into (16, 16) : index, index
%extracted_slice_8 = tensor.extract_slice %arg13[%50#0, %50#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%51 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%50#0]
%52 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%51)
%53 = arith.cmpi eq, %52, %c0 : index
%54 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%52)
%55 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%50#1, %arg1)
%56 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %55)
%57 = arith.cmpi eq, %56, %c0 : index
%58 = arith.ori %57, %53 : i1
%59 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%56)
%60 = scf.if %58 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%62 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%55]
%extracted_slice_10 = tensor.extract_slice %5[%51, %62] [%52, %56] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_10 low[0, 0] high[%54, %59] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%61 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%60 : tensor<1x1xf16>) outs(%extracted_slice_8 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_9 = tensor.insert_slice %61 into %arg13[%50#0, %50#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_9 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %25#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%22 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %25#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%23 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%26 = vector.transfer_read %transposed_3[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x1x4xf16>, vector<1x1x1x1x1x4xf16>
%27 = vector.transfer_read %transposed_6[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x1x4xf16>, vector<1x1x1x4xf16>
%28 = vector.transfer_read %arg11[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%29 = iree_gpu.multi_mma %26, %27, %28 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
%30 = vector.transfer_write %29, %arg11[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
scf.yield %30 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%24 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After VectorizeIREEVectorExtOpsPass (iree-vector-ext-vectorize-ops) //----- //
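// Note: this dispatch contains no iree_vector_ext ops, so the IR below appears
// identical to the previous dump.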
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = linalg.fill ins(%cst_0 : f32) outs(%20 : tensor<1x1x1x1x4x1xf32>) -> tensor<1x1x1x1x4x1xf32>
%22 = tensor.empty() : tensor<1x1x1x1x1x4xf16>
%23 = tensor.empty() : tensor<1x1x1x4xf16>
%24 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%25:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%31 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%32:3 = affine.delinearize_index %31 into (2, 32, 4) : index, index, index
%33 = affine.apply affine_map<(d0) -> (d0 * 4)>(%32#2)
%34 = affine.min affine_map<(d0) -> (2, d0)>(%32#0)
%35 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = affine.min affine_map<(d0) -> (17, d0)>(%32#1)
%38 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = arith.ori %39, %36 : i1
%41 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%38)
%42 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %32#2)
%43 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%42)
%44 = arith.cmpi eq, %43, %c0 : index
%45 = arith.ori %44, %40 : i1
%46 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%43)
%47 = scf.if %45 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_10 = tensor.extract_slice %3[%34, 0, 0, 0] [%35, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%62 = tensor.empty(%35, %38, %43) : tensor<?x1x?x?xf16>
%63 = scf.for %arg14 = %c0 to %35 step %c1 iter_args(%arg15 = %62) -> (tensor<?x1x?x?xf16>) {
%64 = scf.for %arg16 = %c0 to %c1 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%65 = scf.for %arg18 = %c0 to %38 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%66 = scf.for %arg20 = %c0 to %43 step %c1 iter_args(%arg21 = %arg19) -> (tensor<?x1x?x?xf16>) {
%67 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%42, %arg20)
%68 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (((d0 * 17 + d1 + d2 * 17 + d3) floordiv 17) * 2 + (d4 + d5) floordiv 3843)>(%arg16, %arg18, %arg0, %37, %42, %arg20)
%69 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 * 34 + d1 * 2 + d2 * 34 + d3 * 2 - ((d0 * 17 + d1 + d2 * 17 + d3) floordiv 17) * 34 + ((d4 + d5) mod 3843) floordiv 1281)>(%arg16, %arg18, %arg0, %37, %42, %arg20)
%extracted_slice_11 = tensor.extract_slice %extracted_slice_10[%arg14, %68, %69, %67] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_12 = tensor.extract_slice %arg21[%arg14, %arg16, %arg18, %arg20] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%70 = linalg.copy ins(%extracted_slice_11 : tensor<1x1x1x1xf16>) outs(%extracted_slice_12 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_13 = tensor.insert_slice %70 into %arg21[%arg14, %arg16, %arg18, %arg20] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_13 : tensor<?x1x?x?xf16>
}
scf.yield %66 : tensor<?x1x?x?xf16>
}
scf.yield %65 : tensor<?x1x?x?xf16>
}
scf.yield %64 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %63 low[0, 0, 0, 0] high[0, 0, %41, %46] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_7 = tensor.extract_slice %arg12[%32#0, 0, %32#1, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%48 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%47 : tensor<1x1x1x4xf16>) outs(%extracted_slice_7 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %48 into %arg12[%32#0, 0, %32#1, %33] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%49 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%50:2 = affine.delinearize_index %49 into (16, 16) : index, index
%extracted_slice_8 = tensor.extract_slice %arg13[%50#0, %50#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%51 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%50#0]
%52 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%51)
%53 = arith.cmpi eq, %52, %c0 : index
%54 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%52)
%55 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%50#1, %arg1)
%56 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %55)
%57 = arith.cmpi eq, %56, %c0 : index
%58 = arith.ori %57, %53 : i1
%59 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%56)
%60 = scf.if %58 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%62 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%55]
%extracted_slice_10 = tensor.extract_slice %5[%51, %62] [%52, %56] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_10 low[0, 0] high[%54, %59] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%61 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%60 : tensor<1x1xf16>) outs(%extracted_slice_8 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_9 = tensor.insert_slice %61 into %arg13[%50#0, %50#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_9 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %25#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_2 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%transposed_3 = linalg.transpose ins(%extracted_slice_2 : tensor<1x1x1x1x1x4xf16>) outs(%22 : tensor<1x1x1x1x1x4xf16>) permutation = [0, 1, 2, 4, 3, 5]
%expanded_4 = tensor.expand_shape %25#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%transposed_6 = linalg.transpose ins(%extracted_slice_5 : tensor<1x4x1x1xf16>) outs(%23 : tensor<1x1x1x4xf16>) permutation = [0, 2, 3, 1]
%26 = vector.transfer_read %transposed_3[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x1x4xf16>, vector<1x1x1x1x1x4xf16>
%27 = vector.transfer_read %transposed_6[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x1x4xf16>, vector<1x1x1x4xf16>
%28 = vector.transfer_read %arg11[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%29 = iree_gpu.multi_mma %26, %27, %28 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
%30 = vector.transfer_write %29, %arg11[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
scf.yield %30 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%transposed = linalg.transpose ins(%24 : tensor<1x1x1x1x4x1xf32>) outs(%extracted_slice_1 : tensor<1x1x1x4x1x1xf32>) permutation = [0, 1, 2, 4, 3, 5]
scf.forall.in_parallel {
tensor.parallel_insert_slice %transposed into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- //
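// Note: compared with the previous dump, generic vectorization rewrites the
// linalg.fill of the accumulator into a vector.transfer_write of a
// dense<0.0> vector<1x1x1x1x4x1xf32>, and the linalg.transpose ops around the
// iree_gpu.multi_mma into vector.transfer_read / vector.transpose /
// vector.transfer_write sequences. The small linalg.copy ops (including those
// tagged #iree_gpu.derived_thread_config) remain un-vectorized here.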
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%cst_1 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = vector.transfer_write %cst, %20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
%22 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%26:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%34 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%35:3 = affine.delinearize_index %34 into (2, 32, 4) : index, index, index
%36 = affine.apply affine_map<(d0) -> (d0 * 4)>(%35#2)
%37 = affine.min affine_map<(d0) -> (2, d0)>(%35#0)
%38 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = affine.min affine_map<(d0) -> (17, d0)>(%35#1)
%41 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %39 : i1
%44 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%41)
%45 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %35#2)
%46 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%45)
%47 = arith.cmpi eq, %46, %c0 : index
%48 = arith.ori %47, %43 : i1
%49 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%46)
%50 = scf.if %48 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_9 = tensor.extract_slice %3[%37, 0, 0, 0] [%38, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%65 = tensor.empty(%38, %41, %46) : tensor<?x1x?x?xf16>
%66 = scf.for %arg14 = %c0 to %38 step %c1 iter_args(%arg15 = %65) -> (tensor<?x1x?x?xf16>) {
%67 = scf.for %arg16 = %c0 to %c1 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%68 = scf.for %arg18 = %c0 to %41 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%69 = scf.for %arg20 = %c0 to %46 step %c1 iter_args(%arg21 = %arg19) -> (tensor<?x1x?x?xf16>) {
%70 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%45, %arg20)
%71 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (((d0 * 17 + d1 + d2 * 17 + d3) floordiv 17) * 2 + (d4 + d5) floordiv 3843)>(%arg16, %arg18, %arg0, %40, %45, %arg20)
%72 = affine.apply affine_map<(d0, d1, d2, d3, d4, d5) -> (d0 * 34 + d1 * 2 + d2 * 34 + d3 * 2 - ((d0 * 17 + d1 + d2 * 17 + d3) floordiv 17) * 34 + ((d4 + d5) mod 3843) floordiv 1281)>(%arg16, %arg18, %arg0, %40, %45, %arg20)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_9[%arg14, %71, %72, %70] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_11 = tensor.extract_slice %arg21[%arg14, %arg16, %arg18, %arg20] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%73 = linalg.copy ins(%extracted_slice_10 : tensor<1x1x1x1xf16>) outs(%extracted_slice_11 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_12 = tensor.insert_slice %73 into %arg21[%arg14, %arg16, %arg18, %arg20] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_12 : tensor<?x1x?x?xf16>
}
scf.yield %69 : tensor<?x1x?x?xf16>
}
scf.yield %68 : tensor<?x1x?x?xf16>
}
scf.yield %67 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %66 low[0, 0, 0, 0] high[0, 0, %44, %49] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_6 = tensor.extract_slice %arg12[%35#0, 0, %35#1, %36] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%51 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%50 : tensor<1x1x1x4xf16>) outs(%extracted_slice_6 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %51 into %arg12[%35#0, 0, %35#1, %36] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%52 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%53:2 = affine.delinearize_index %52 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg13[%53#0, %53#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%54 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%53#0]
%55 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%54)
%56 = arith.cmpi eq, %55, %c0 : index
%57 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%55)
%58 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%53#1, %arg1)
%59 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %58)
%60 = arith.cmpi eq, %59, %c0 : index
%61 = arith.ori %60, %56 : i1
%62 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%59)
%63 = scf.if %61 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%65 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%58]
%extracted_slice_9 = tensor.extract_slice %5[%54, %65] [%55, %59] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_9 low[0, 0] high[%57, %62] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%64 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%63 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_8 = tensor.insert_slice %64 into %arg13[%53#0, %53#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_8 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %26#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_3 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%27 = vector.transfer_read %extracted_slice_3[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x1x4xf16>, vector<1x1x1x1x1x4xf16>
%28 = vector.transpose %27, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_4 = tensor.expand_shape %26#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%29 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x4x1x1xf16>, vector<1x4x1x1xf16>
%30 = vector.transpose %29, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%31 = vector.transfer_read %arg11[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%32 = iree_gpu.multi_mma %28, %30, %31 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
%33 = vector.transfer_write %32, %arg11[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
scf.yield %33 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%23 = vector.transfer_read %22[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%24 = vector.transpose %23, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%25 = vector.transfer_write %24, %extracted_slice_2[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_2) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_4 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x1x1xf32>) outs(%extracted_slice_4 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
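// The inner copy loops above reconstruct the im2col indexing of the stride-2 convolution:
// the flattened reduction index (size 11529 = 3 * 3 * 1281) is split back into (kh, kw, c)
// by the `floordiv 3843` / `floordiv 1281` / `mod 1281` terms, and the output pixel is mapped
// to the input pixel it reads via the `* 2` stride factors. A minimal Python sketch of that
// decomposition (function names and the example values are illustrative only, not from the IR):

C = 1281           # input channels
KW = 3             # kernel width
STRIDE = 2

def decompose_k(k):
    """Split a flattened IGEMM reduction index k in [0, 3*3*1281) into (kh, kw, c)."""
    kh = k // (KW * C)            # matches the `... floordiv 3843` term
    kw = (k % (KW * C)) // C      # matches the `(... mod 3843) floordiv 1281` term
    c = k % C                     # matches the `... mod 1281` term
    return kh, kw, c

def input_coords(out_h, out_w, k):
    """Input pixel read for output pixel (out_h, out_w) at reduction index k."""
    kh, kw, c = decompose_k(k)
    return out_h * STRIDE + kh, out_w * STRIDE + kw, c

print(decompose_k(5000))         # (1, 0, 1157)
print(input_coords(1, 2, 5000))  # (3, 4, 1157)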
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%cst_1 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = vector.transfer_write %cst, %20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
%22 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%26:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%34 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%35:3 = affine.delinearize_index %34 into (2, 32, 4) : index, index, index
%36 = affine.apply affine_map<(d0) -> (d0 * 4)>(%35#2)
%37 = affine.min affine_map<(d0) -> (2, d0)>(%35#0)
%38 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = affine.min affine_map<(d0) -> (17, d0)>(%35#1)
%41 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %39 : i1
%44 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%41)
%45 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %35#2)
%46 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%45)
%47 = arith.cmpi eq, %46, %c0 : index
%48 = arith.ori %47, %43 : i1
%49 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%46)
%50 = scf.if %48 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_9 = tensor.extract_slice %3[%37, 0, 0, 0] [%38, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%65 = tensor.empty(%38, %41, %46) : tensor<?x1x?x?xf16>
%66 = scf.for %arg14 = %c0 to %38 step %c1 iter_args(%arg15 = %65) -> (tensor<?x1x?x?xf16>) {
%67 = scf.for %arg16 = %c0 to %41 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%68 = scf.for %arg18 = %c0 to %46 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%69 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%45, %arg18)
%70 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %40, %45, %arg18)
%71 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %40, %45, %arg18)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_9[%arg14, %70, %71, %69] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_11 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%72 = linalg.copy ins(%extracted_slice_10 : tensor<1x1x1x1xf16>) outs(%extracted_slice_11 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_12 = tensor.insert_slice %72 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_12 : tensor<?x1x?x?xf16>
}
scf.yield %68 : tensor<?x1x?x?xf16>
}
scf.yield %67 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %66 low[0, 0, 0, 0] high[0, 0, %44, %49] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_6 = tensor.extract_slice %arg12[%35#0, 0, %35#1, %36] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%51 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%50 : tensor<1x1x1x4xf16>) outs(%extracted_slice_6 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %51 into %arg12[%35#0, 0, %35#1, %36] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%52 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%53:2 = affine.delinearize_index %52 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg13[%53#0, %53#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%54 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%53#0]
%55 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%54)
%56 = arith.cmpi eq, %55, %c0 : index
%57 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%55)
%58 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%53#1, %arg1)
%59 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %58)
%60 = arith.cmpi eq, %59, %c0 : index
%61 = arith.ori %60, %56 : i1
%62 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%59)
%63 = scf.if %61 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%65 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%58]
%extracted_slice_9 = tensor.extract_slice %5[%54, %65] [%55, %59] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_9 low[0, 0] high[%57, %62] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%64 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%63 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_8 = tensor.insert_slice %64 into %arg13[%53#0, %53#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_8 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %26#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_3 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%27 = vector.transfer_read %extracted_slice_3[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x1x4xf16>, vector<1x1x1x1x1x4xf16>
%28 = vector.transpose %27, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_4 = tensor.expand_shape %26#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%29 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x4x1x1xf16>, vector<1x4x1x1xf16>
%30 = vector.transpose %29, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%31 = vector.transfer_read %arg11[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%32 = iree_gpu.multi_mma %28, %30, %31 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
%33 = vector.transfer_write %32, %arg11[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
scf.yield %33 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%23 = vector.transfer_read %22[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%24 = vector.transpose %23, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%25 = vector.transfer_write %24, %extracted_slice_2[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_2) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_4 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x1x1xf32>) outs(%extracted_slice_4 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
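// The two per-lane maps that recur in each dump, `d0 mod 16` and
// `(d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16`, pick the element of a 16x16
// MFMA fragment a lane touches: the first is the column within the tile, and the second is an
// expanded form of `4 * ((d0 floordiv 16) mod 4)`, a 4-element offset along the other tile
// dimension. A small Python check of that equivalence over a 64-lane subgroup (illustrative
// only; the function names are not from the IR):

def lane_col(lane):
    # affine_map<(d0) -> (d0 mod 16)>
    return lane % 16

def lane_offset(lane):
    # affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>
    return (lane // 16) * 4 - ((lane // 16) // 4) * 16

for lane in range(64):
    assert lane_offset(lane) == 4 * ((lane // 16) % 4)
    assert 0 <= lane_col(lane) < 16 and lane_offset(lane) in (0, 4, 8, 12)
print("64 lanes -> 16 columns x offsets {0, 4, 8, 12}")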
// -----// IR Dump After CSE (cse) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%cst_1 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = tensor.empty() : tensor<1x1x1x1x4x1xf32>
%21 = vector.transfer_write %cst, %20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
%22 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %21) -> (tensor<1x1x1x1x4x1xf32>) {
%26:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%34 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%35:3 = affine.delinearize_index %34 into (2, 32, 4) : index, index, index
%36 = affine.apply affine_map<(d0) -> (d0 * 4)>(%35#2)
%37 = affine.min affine_map<(d0) -> (2, d0)>(%35#0)
%38 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%37)
%39 = arith.cmpi eq, %38, %c0 : index
%40 = affine.min affine_map<(d0) -> (17, d0)>(%35#1)
%41 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %39 : i1
%44 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%41)
%45 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %35#2)
%46 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%45)
%47 = arith.cmpi eq, %46, %c0 : index
%48 = arith.ori %47, %43 : i1
%49 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%46)
%50 = scf.if %48 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_9 = tensor.extract_slice %3[%37, 0, 0, 0] [%38, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%64 = tensor.empty(%38, %41, %46) : tensor<?x1x?x?xf16>
%65 = scf.for %arg14 = %c0 to %38 step %c1 iter_args(%arg15 = %64) -> (tensor<?x1x?x?xf16>) {
%66 = scf.for %arg16 = %c0 to %41 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%67 = scf.for %arg18 = %c0 to %46 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%68 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%45, %arg18)
%69 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %40, %45, %arg18)
%70 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %40, %45, %arg18)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_9[%arg14, %69, %70, %68] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_11 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%71 = linalg.copy ins(%extracted_slice_10 : tensor<1x1x1x1xf16>) outs(%extracted_slice_11 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_12 = tensor.insert_slice %71 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_12 : tensor<?x1x?x?xf16>
}
scf.yield %67 : tensor<?x1x?x?xf16>
}
scf.yield %66 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %65 low[0, 0, 0, 0] high[0, 0, %44, %49] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_6 = tensor.extract_slice %arg12[%35#0, 0, %35#1, %36] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%51 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%50 : tensor<1x1x1x4xf16>) outs(%extracted_slice_6 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %51 into %arg12[%35#0, 0, %35#1, %36] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%52:2 = affine.delinearize_index %34 into (16, 16) : index, index
%extracted_slice_7 = tensor.extract_slice %arg13[%52#0, %52#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%53 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%52#0]
%54 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%53)
%55 = arith.cmpi eq, %54, %c0 : index
%56 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%54)
%57 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%52#1, %arg1)
%58 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %57)
%59 = arith.cmpi eq, %58, %c0 : index
%60 = arith.ori %59, %55 : i1
%61 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%58)
%62 = scf.if %60 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%64 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%57]
%extracted_slice_9 = tensor.extract_slice %5[%53, %64] [%54, %58] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_9 low[0, 0] high[%56, %61] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%63 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%62 : tensor<1x1xf16>) outs(%extracted_slice_7 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_8 = tensor.insert_slice %63 into %arg13[%52#0, %52#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_8 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %26#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%extracted_slice_3 = tensor.extract_slice %expanded[%arg3, 0, %arg5, %18, 0, %19] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf16> to tensor<1x1x1x1x1x4xf16>
%27 = vector.transfer_read %extracted_slice_3[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x1x4xf16>, vector<1x1x1x1x1x4xf16>
%28 = vector.transpose %27, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_4 = tensor.expand_shape %26#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%extracted_slice_5 = tensor.extract_slice %expanded_4[0, %19, 0, %18] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x16x1x16xf16> to tensor<1x4x1x1xf16>
%29 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x4x1x1xf16>, vector<1x4x1x1xf16>
%30 = vector.transpose %29, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%31 = vector.transfer_read %arg11[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%32 = iree_gpu.multi_mma %28, %30, %31 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
%33 = vector.transfer_write %32, %arg11[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x1xf32>, tensor<1x1x1x1x4x1xf32>
scf.yield %33 : tensor<1x1x1x1x4x1xf32>
}
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%23 = vector.transfer_read %22[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x1xf32>, vector<1x1x1x1x4x1xf32>
%24 = vector.transpose %23, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%25 = vector.transfer_write %24, %extracted_slice_2[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_2) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_4 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x1x1xf32>) outs(%extracted_slice_4 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
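// Relative to the canonicalized dump above, the visible effect of CSE is that the linear
// thread index (%34 = d0 + d1 * 64 + d2 * 128 over the lane and warp ids) is computed once and
// fed to both affine.delinearize_index ops, instead of being recomputed before the second one.
// Those two delinearizations give each thread its slot in the two shared-memory staging
// tensors. A hypothetical Python equivalent of the two bases (illustrative only, not IREE code):

def delinearize(i, basis):
    """Mimic affine.delinearize_index for an in-range index: mixed-radix digits of i."""
    out = []
    for b in reversed(basis):
        out.append(i % b)
        i //= b
    return tuple(reversed(out))

tid = 1 + 0 * 64 + 1 * 128             # e.g. lane 1, second pair of warp indices
print(delinearize(tid, (2, 32, 4)))    # slot in the 2x1x32x16 staging tensor -> (1, 0, 1)
print(delinearize(tid, (16, 16)))      # slot in the 16x16 staging tensor     -> (8, 1)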
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
%23:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%29 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%30:3 = affine.delinearize_index %29 into (2, 32, 4) : index, index, index
%31 = affine.apply affine_map<(d0) -> (d0 * 4)>(%30#2)
%32 = affine.min affine_map<(d0) -> (2, d0)>(%30#0)
%33 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%32)
%34 = arith.cmpi eq, %33, %c0 : index
%35 = affine.min affine_map<(d0) -> (17, d0)>(%30#1)
%36 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %37, %34 : i1
%39 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%36)
%40 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %30#2)
%41 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %38 : i1
%44 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%41)
%45 = scf.if %43 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_6 = tensor.extract_slice %3[%32, 0, 0, 0] [%33, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%59 = tensor.empty(%33, %36, %41) : tensor<?x1x?x?xf16>
%60 = scf.for %arg14 = %c0 to %33 step %c1 iter_args(%arg15 = %59) -> (tensor<?x1x?x?xf16>) {
%61 = scf.for %arg16 = %c0 to %36 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%62 = scf.for %arg18 = %c0 to %41 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%63 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%40, %arg18)
%64 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %35, %40, %arg18)
%65 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %35, %40, %arg18)
%extracted_slice_7 = tensor.extract_slice %extracted_slice_6[%arg14, %64, %65, %63] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_8 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%66 = linalg.copy ins(%extracted_slice_7 : tensor<1x1x1x1xf16>) outs(%extracted_slice_8 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_9 = tensor.insert_slice %66 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_9 : tensor<?x1x?x?xf16>
}
scf.yield %62 : tensor<?x1x?x?xf16>
}
scf.yield %61 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %60 low[0, 0, 0, 0] high[0, 0, %39, %44] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_3 = tensor.extract_slice %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%46 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%45 : tensor<1x1x1x4xf16>) outs(%extracted_slice_3 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %46 into %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%47:2 = affine.delinearize_index %29 into (16, 16) : index, index
%extracted_slice_4 = tensor.extract_slice %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%48 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%47#0]
%49 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%48)
%50 = arith.cmpi eq, %49, %c0 : index
%51 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%49)
%52 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%47#1, %arg1)
%53 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %52)
%54 = arith.cmpi eq, %53, %c0 : index
%55 = arith.ori %54, %50 : i1
%56 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%53)
%57 = scf.if %55 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%59 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%52]
%extracted_slice_6 = tensor.extract_slice %5[%48, %59] [%49, %53] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_6 low[0, 0] high[%51, %56] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%58 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%57 : tensor<1x1xf16>) outs(%extracted_slice_4 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_5 = tensor.insert_slice %58 into %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_5 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %23#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%24 = vector.transfer_read %expanded[%arg3, %c0, %arg5, %18, %c0, %19], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<2x1x2x16x1x16xf16>, vector<1x1x1x1x1x4xf16>
%25 = vector.transpose %24, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_2 = tensor.expand_shape %23#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%26 = vector.transfer_read %expanded_2[%c0, %19, %c0, %18], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x16x1x16xf16>, vector<1x4x1x1xf16>
%27 = vector.transpose %26, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%28 = iree_gpu.multi_mma %25, %27, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %28 : vector<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%21 = vector.transpose %20, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%22 = vector.transfer_write %21, %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x1x1xf32>) outs(%extracted_slice_2 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
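// Compared with the CSE dump above, the visible effect of
// iree-codegen-optimize-tensor-insert-extract-slices is that the accumulator no longer
// round-trips through a tensor: the scf.for over the 721 reduction steps now carries
// vector<1x1x1x1x4x1xf32> directly as its iter_arg (seeded from the zero constant), the reads
// of the expanded shared tiles no longer go through per-thread extract_slices, and a single
// transpose + transfer_write remains after the loop. A rough Python sketch of that shape of
// rewrite (illustrative only; `mma`, `a`, `b`, and `acc_buf` are stand-ins, not IREE APIs):

K = 4
a = [1.0] * K
b = [2.0] * K
acc_buf = [0.0]              # plays the role of the tensor the accumulator lived in

def mma(x, y, acc):          # stand-in for the per-step multiply-accumulate
    return acc + x * y

# Before: the running value round-trips through the buffer on every step.
acc_buf[0] = 0.0
for k in range(K):
    acc_buf[0] = mma(a[k], b[k], acc_buf[0])

# After: the running value is loop-carried; one store after the loop.
acc = 0.0
for k in range(K):
    acc = mma(a[k], b[k], acc)
acc_buf[0] = acc

print(acc_buf[0])            # 8.0 either way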
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
%23:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%29 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%30:3 = affine.delinearize_index %29 into (2, 32, 4) : index, index, index
%31 = affine.apply affine_map<(d0) -> (d0 * 4)>(%30#2)
%32 = affine.min affine_map<(d0) -> (2, d0)>(%30#0)
%33 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%32)
%34 = arith.cmpi eq, %33, %c0 : index
%35 = affine.min affine_map<(d0) -> (17, d0)>(%30#1)
%36 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %37, %34 : i1
%39 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%36)
%40 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %30#2)
%41 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %38 : i1
%44 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%41)
%45 = scf.if %43 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_6 = tensor.extract_slice %3[%32, 0, 0, 0] [%33, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%59 = tensor.empty(%33, %36, %41) : tensor<?x1x?x?xf16>
%60 = scf.for %arg14 = %c0 to %33 step %c1 iter_args(%arg15 = %59) -> (tensor<?x1x?x?xf16>) {
%61 = scf.for %arg16 = %c0 to %36 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%62 = scf.for %arg18 = %c0 to %41 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%63 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%40, %arg18)
%64 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %35, %40, %arg18)
%65 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %35, %40, %arg18)
%extracted_slice_7 = tensor.extract_slice %extracted_slice_6[%arg14, %64, %65, %63] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_8 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%66 = linalg.copy ins(%extracted_slice_7 : tensor<1x1x1x1xf16>) outs(%extracted_slice_8 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_9 = tensor.insert_slice %66 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_9 : tensor<?x1x?x?xf16>
}
scf.yield %62 : tensor<?x1x?x?xf16>
}
scf.yield %61 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %60 low[0, 0, 0, 0] high[0, 0, %39, %44] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_3 = tensor.extract_slice %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%46 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%45 : tensor<1x1x1x4xf16>) outs(%extracted_slice_3 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %46 into %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%47:2 = affine.delinearize_index %29 into (16, 16) : index, index
%extracted_slice_4 = tensor.extract_slice %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%48 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%47#0]
%49 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%48)
%50 = arith.cmpi eq, %49, %c0 : index
%51 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%49)
%52 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%47#1, %arg1)
%53 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %52)
%54 = arith.cmpi eq, %53, %c0 : index
%55 = arith.ori %54, %50 : i1
%56 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%53)
%57 = scf.if %55 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%59 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%52]
%extracted_slice_6 = tensor.extract_slice %5[%48, %59] [%49, %53] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_6 low[0, 0] high[%51, %56] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%58 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%57 : tensor<1x1xf16>) outs(%extracted_slice_4 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_5 = tensor.insert_slice %58 into %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_5 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %23#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%24 = vector.transfer_read %expanded[%arg3, %c0, %arg5, %18, %c0, %19], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<2x1x2x16x1x16xf16>, vector<1x1x1x1x1x4xf16>
%25 = vector.transpose %24, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_2 = tensor.expand_shape %23#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%26 = vector.transfer_read %expanded_2[%c0, %19, %c0, %18], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x16x1x16xf16>, vector<1x4x1x1xf16>
%27 = vector.transpose %26, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%28 = iree_gpu.multi_mma %25, %27, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %28 : vector<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%21 = vector.transpose %20, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%22 = vector.transfer_write %21, %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x1x1xf32>) outs(%extracted_slice_2 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
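// (note: cse folds identical pure operations with identical operands into a single result; the function below is the IR after that deduplication.)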
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
%23:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%29 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%30:3 = affine.delinearize_index %29 into (2, 32, 4) : index, index, index
%31 = affine.apply affine_map<(d0) -> (d0 * 4)>(%30#2)
%32 = affine.min affine_map<(d0) -> (2, d0)>(%30#0)
%33 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%32)
%34 = arith.cmpi eq, %33, %c0 : index
%35 = affine.min affine_map<(d0) -> (17, d0)>(%30#1)
%36 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %37, %34 : i1
%39 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%36)
%40 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %30#2)
%41 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %38 : i1
%44 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%41)
%45 = scf.if %43 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_6 = tensor.extract_slice %3[%32, 0, 0, 0] [%33, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%59 = tensor.empty(%33, %36, %41) : tensor<?x1x?x?xf16>
%60 = scf.for %arg14 = %c0 to %33 step %c1 iter_args(%arg15 = %59) -> (tensor<?x1x?x?xf16>) {
%61 = scf.for %arg16 = %c0 to %36 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%62 = scf.for %arg18 = %c0 to %41 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%63 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%40, %arg18)
%64 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %35, %40, %arg18)
%65 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %35, %40, %arg18)
%extracted_slice_7 = tensor.extract_slice %extracted_slice_6[%arg14, %64, %65, %63] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_8 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%66 = linalg.copy ins(%extracted_slice_7 : tensor<1x1x1x1xf16>) outs(%extracted_slice_8 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_9 = tensor.insert_slice %66 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_9 : tensor<?x1x?x?xf16>
}
scf.yield %62 : tensor<?x1x?x?xf16>
}
scf.yield %61 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %60 low[0, 0, 0, 0] high[0, 0, %39, %44] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_3 = tensor.extract_slice %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%46 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%45 : tensor<1x1x1x4xf16>) outs(%extracted_slice_3 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %46 into %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%47:2 = affine.delinearize_index %29 into (16, 16) : index, index
%extracted_slice_4 = tensor.extract_slice %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%48 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%47#0]
%49 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%48)
%50 = arith.cmpi eq, %49, %c0 : index
%51 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%49)
%52 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%47#1, %arg1)
%53 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %52)
%54 = arith.cmpi eq, %53, %c0 : index
%55 = arith.ori %54, %50 : i1
%56 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%53)
%57 = scf.if %55 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%59 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%52]
%extracted_slice_6 = tensor.extract_slice %5[%48, %59] [%49, %53] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_6 low[0, 0] high[%51, %56] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%58 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%57 : tensor<1x1xf16>) outs(%extracted_slice_4 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_5 = tensor.insert_slice %58 into %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_5 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %23#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%24 = vector.transfer_read %expanded[%arg3, %c0, %arg5, %18, %c0, %19], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<2x1x2x16x1x16xf16>, vector<1x1x1x1x1x4xf16>
%25 = vector.transpose %24, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_2 = tensor.expand_shape %23#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%26 = vector.transfer_read %expanded_2[%c0, %19, %c0, %18], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x16x1x16xf16>, vector<1x4x1x1xf16>
%27 = vector.transpose %26, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%28 = iree_gpu.multi_mma %25, %27, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %28 : vector<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%21 = vector.transpose %20, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%22 = vector.transfer_write %21, %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x1x1xf32>) outs(%extracted_slice_2 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
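// (note: this cleanup pass folds redundant buffer allocation/view patterns; in this dump it appears to make no change, and the function below matches the CSE output above.)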
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
%23:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%29 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%30:3 = affine.delinearize_index %29 into (2, 32, 4) : index, index, index
%31 = affine.apply affine_map<(d0) -> (d0 * 4)>(%30#2)
%32 = affine.min affine_map<(d0) -> (2, d0)>(%30#0)
%33 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%32)
%34 = arith.cmpi eq, %33, %c0 : index
%35 = affine.min affine_map<(d0) -> (17, d0)>(%30#1)
%36 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %37, %34 : i1
%39 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%36)
%40 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %30#2)
%41 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %38 : i1
%44 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%41)
%45 = scf.if %43 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_6 = tensor.extract_slice %3[%32, 0, 0, 0] [%33, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%59 = tensor.empty(%33, %36, %41) : tensor<?x1x?x?xf16>
%60 = scf.for %arg14 = %c0 to %33 step %c1 iter_args(%arg15 = %59) -> (tensor<?x1x?x?xf16>) {
%61 = scf.for %arg16 = %c0 to %36 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%62 = scf.for %arg18 = %c0 to %41 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%63 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%40, %arg18)
%64 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %35, %40, %arg18)
%65 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %35, %40, %arg18)
%extracted_slice_7 = tensor.extract_slice %extracted_slice_6[%arg14, %64, %65, %63] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_8 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%66 = linalg.copy ins(%extracted_slice_7 : tensor<1x1x1x1xf16>) outs(%extracted_slice_8 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_9 = tensor.insert_slice %66 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_9 : tensor<?x1x?x?xf16>
}
scf.yield %62 : tensor<?x1x?x?xf16>
}
scf.yield %61 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %60 low[0, 0, 0, 0] high[0, 0, %39, %44] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_3 = tensor.extract_slice %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%46 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%45 : tensor<1x1x1x4xf16>) outs(%extracted_slice_3 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %46 into %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%47:2 = affine.delinearize_index %29 into (16, 16) : index, index
%extracted_slice_4 = tensor.extract_slice %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%48 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%47#0]
%49 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%48)
%50 = arith.cmpi eq, %49, %c0 : index
%51 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%49)
%52 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%47#1, %arg1)
%53 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %52)
%54 = arith.cmpi eq, %53, %c0 : index
%55 = arith.ori %54, %50 : i1
%56 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%53)
%57 = scf.if %55 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%59 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%52]
%extracted_slice_6 = tensor.extract_slice %5[%48, %59] [%49, %53] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_6 low[0, 0] high[%51, %56] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%58 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%57 : tensor<1x1xf16>) outs(%extracted_slice_4 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_5 = tensor.insert_slice %58 into %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_5 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %23#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%24 = vector.transfer_read %expanded[%arg3, %c0, %arg5, %18, %c0, %19], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<2x1x2x16x1x16xf16>, vector<1x1x1x1x1x4xf16>
%25 = vector.transpose %24, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_2 = tensor.expand_shape %23#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%26 = vector.transfer_read %expanded_2[%c0, %19, %c0, %18], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x16x1x16xf16>, vector<1x4x1x1xf16>
%27 = vector.transpose %26, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%28 = iree_gpu.multi_mma %25, %27, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %28 : vector<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%21 = vector.transpose %20, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%22 = vector.transfer_write %21, %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x1x1xf32>) outs(%extracted_slice_2 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After GPUCombineValueBarriersPass (iree-codegen-gpu-combine-value-barriers) //----- //
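// (note: this pass merges adjacent value barriers; here the single iree_gpu.barrier_region already covers both shared-memory tensors, so the IR appears unchanged from the previous dump.)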
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = tensor.empty() : tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%17 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice) -> (tensor<1x1x1x16x1x16xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%20 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
%23:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%29 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%30:3 = affine.delinearize_index %29 into (2, 32, 4) : index, index, index
%31 = affine.apply affine_map<(d0) -> (d0 * 4)>(%30#2)
%32 = affine.min affine_map<(d0) -> (2, d0)>(%30#0)
%33 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%32)
%34 = arith.cmpi eq, %33, %c0 : index
%35 = affine.min affine_map<(d0) -> (17, d0)>(%30#1)
%36 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %37, %34 : i1
%39 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%36)
%40 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %30#2)
%41 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %38 : i1
%44 = affine.apply affine_map<(d0) -> (-d0 + 4)>(%41)
%45 = scf.if %43 -> (tensor<1x1x1x4xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<1x1x1x4xf16>
scf.yield %generated : tensor<1x1x1x4xf16>
} else {
%extracted_slice_6 = tensor.extract_slice %3[%32, 0, 0, 0] [%33, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%59 = tensor.empty(%33, %36, %41) : tensor<?x1x?x?xf16>
%60 = scf.for %arg14 = %c0 to %33 step %c1 iter_args(%arg15 = %59) -> (tensor<?x1x?x?xf16>) {
%61 = scf.for %arg16 = %c0 to %36 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%62 = scf.for %arg18 = %c0 to %41 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%63 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%40, %arg18)
%64 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %35, %40, %arg18)
%65 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %35, %40, %arg18)
%extracted_slice_7 = tensor.extract_slice %extracted_slice_6[%arg14, %64, %65, %63] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_8 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%66 = linalg.copy ins(%extracted_slice_7 : tensor<1x1x1x1xf16>) outs(%extracted_slice_8 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_9 = tensor.insert_slice %66 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_9 : tensor<?x1x?x?xf16>
}
scf.yield %62 : tensor<?x1x?x?xf16>
}
scf.yield %61 : tensor<?x1x?x?xf16>
}
%padded = tensor.pad %60 low[0, 0, 0, 0] high[0, 0, %39, %44] {
^bb0(%arg14: index, %arg15: index, %arg16: index, %arg17: index):
tensor.yield %cst_0 : f16
} : tensor<?x1x?x?xf16> to tensor<1x1x1x4xf16>
scf.yield %padded : tensor<1x1x1x4xf16>
}
%extracted_slice_3 = tensor.extract_slice %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%46 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%45 : tensor<1x1x1x4xf16>) outs(%extracted_slice_3 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %46 into %arg12[%30#0, 0, %30#1, %31] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%47:2 = affine.delinearize_index %29 into (16, 16) : index, index
%extracted_slice_4 = tensor.extract_slice %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%48 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%47#0]
%49 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%48)
%50 = arith.cmpi eq, %49, %c0 : index
%51 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%49)
%52 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%47#1, %arg1)
%53 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %52)
%54 = arith.cmpi eq, %53, %c0 : index
%55 = arith.ori %54, %50 : i1
%56 = affine.apply affine_map<(d0) -> (-d0 + 1)>(%53)
%57 = scf.if %55 -> (tensor<1x1xf16>) {
%generated = tensor.generate {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<1x1xf16>
scf.yield %generated : tensor<1x1xf16>
} else {
%59 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%52]
%extracted_slice_6 = tensor.extract_slice %5[%48, %59] [%49, %53] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%padded = tensor.pad %extracted_slice_6 low[0, 0] high[%51, %56] {
^bb0(%arg14: index, %arg15: index):
tensor.yield %cst_0 : f16
} : tensor<?x?xf16> to tensor<1x1xf16>
scf.yield %padded : tensor<1x1xf16>
}
%58 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%57 : tensor<1x1xf16>) outs(%extracted_slice_4 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_5 = tensor.insert_slice %58 into %arg13[%47#0, %47#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_5 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %23#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%24 = vector.transfer_read %expanded[%arg3, %c0, %arg5, %18, %c0, %19], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<2x1x2x16x1x16xf16>, vector<1x1x1x1x1x4xf16>
%25 = vector.transpose %24, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_2 = tensor.expand_shape %23#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%26 = vector.transfer_read %expanded_2[%c0, %19, %c0, %18], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x16x1x16xf16>, vector<1x4x1x1xf16>
%27 = vector.transpose %26, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%28 = iree_gpu.multi_mma %25, %27, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %28 : vector<1x1x1x1x4x1xf32>
}
%extracted_slice_1 = tensor.extract_slice %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%21 = vector.transpose %20, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%22 = vector.transfer_write %21, %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg9[0, 0, 0, %19, 0, %18] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%14 = tensor.empty(%8) : tensor<2x1x17x?xf32>
%15 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%16 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %15) shared_outs(%arg7 = %14) -> (tensor<2x1x17x?xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%18 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%19 = scf.for %arg8 = %c0 to %18 step %c1 iter_args(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%20 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_3 : tensor<1x1x1x1xf32>) outs(%extracted_slice_2 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %20 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg3, 0, %arg5, %17] [1, 1, 1, %18] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- //
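// (note: empty-tensor elimination rewrites the dispatch into destination-passing style; the result tensor.empty is replaced by a load of the writeonly output binding (%4) feeding shared_outs, and the tensor.pad / tensor.generate producers are rewritten as linalg.fill / linalg.generic plus tensor.insert_slice.)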
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>> -> tensor<2x17x17x1281xf32>
%5 = tensor.empty() : tensor<2x17x17x1281xf32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%7 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%8 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%9 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%11 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%12 = tensor.empty() : tensor<2x1x2x16x1x16xf32>
%13 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %12) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%18 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x16x1x16xf32>) {
%19 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%20 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%21 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
%24:2 = iree_gpu.barrier_region ins(%10, %11 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%30 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%31:3 = affine.delinearize_index %30 into (2, 32, 4) : index, index, index
%32 = affine.apply affine_map<(d0) -> (d0 * 4)>(%31#2)
%33 = affine.min affine_map<(d0) -> (2, d0)>(%31#0)
%34 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = affine.min affine_map<(d0) -> (17, d0)>(%31#1)
%37 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%36)
%38 = arith.cmpi eq, %37, %c0 : index
%39 = arith.ori %38, %35 : i1
%40 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %31#2)
%41 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%40)
%42 = arith.cmpi eq, %41, %c0 : index
%43 = arith.ori %42, %39 : i1
%44 = scf.if %43 -> (tensor<1x1x1x4xf16>) {
%56 = tensor.empty() : tensor<1x1x1x4xf16>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%56 : tensor<1x1x1x4xf16>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
} -> tensor<1x1x1x4xf16>
scf.yield %57 : tensor<1x1x1x4xf16>
} else {
%extracted_slice_7 = tensor.extract_slice %3[%33, 0, 0, 0] [%34, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%56 = tensor.empty(%34, %37, %41) : tensor<?x1x?x?xf16>
%57 = scf.for %arg14 = %c0 to %34 step %c1 iter_args(%arg15 = %56) -> (tensor<?x1x?x?xf16>) {
%60 = scf.for %arg16 = %c0 to %37 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%61 = scf.for %arg18 = %c0 to %41 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%62 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%40, %arg18)
%63 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %36, %40, %arg18)
%64 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %36, %40, %arg18)
%extracted_slice_11 = tensor.extract_slice %extracted_slice_7[%arg14, %63, %64, %62] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_12 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%65 = linalg.copy ins(%extracted_slice_11 : tensor<1x1x1x1xf16>) outs(%extracted_slice_12 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_13 = tensor.insert_slice %65 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_13 : tensor<?x1x?x?xf16>
}
scf.yield %61 : tensor<?x1x?x?xf16>
}
scf.yield %60 : tensor<?x1x?x?xf16>
}
%58 = tensor.empty() : tensor<1x1x1x4xf16>
%59 = linalg.fill ins(%cst_0 : f16) outs(%58 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%dim = tensor.dim %57, %c0 : tensor<?x1x?x?xf16>
%dim_8 = tensor.dim %57, %c2 : tensor<?x1x?x?xf16>
%dim_9 = tensor.dim %57, %c3 : tensor<?x1x?x?xf16>
%inserted_slice_10 = tensor.insert_slice %57 into %59[0, 0, 0, 0] [%dim, 1, %dim_8, %dim_9] [1, 1, 1, 1] : tensor<?x1x?x?xf16> into tensor<1x1x1x4xf16>
scf.yield %inserted_slice_10 : tensor<1x1x1x4xf16>
}
%extracted_slice_4 = tensor.extract_slice %arg12[%31#0, 0, %31#1, %32] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%45 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%44 : tensor<1x1x1x4xf16>) outs(%extracted_slice_4 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %45 into %arg12[%31#0, 0, %31#1, %32] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%46:2 = affine.delinearize_index %30 into (16, 16) : index, index
%extracted_slice_5 = tensor.extract_slice %arg13[%46#0, %46#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%47 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%46#0]
%48 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%47)
%49 = arith.cmpi eq, %48, %c0 : index
%50 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%46#1, %arg1)
%51 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%9, %50)
%52 = arith.cmpi eq, %51, %c0 : index
%53 = arith.ori %52, %49 : i1
%54 = scf.if %53 -> (tensor<1x1xf16>) {
%56 = tensor.empty() : tensor<1x1xf16>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%extracted_slice_5 : tensor<1x1xf16>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
} -> tensor<1x1xf16>
scf.yield %57 : tensor<1x1xf16>
} else {
%56 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%50]
%extracted_slice_7 = tensor.extract_slice %6[%47, %56] [%48, %51] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%57 = tensor.empty() : tensor<1x1xf16>
%58 = linalg.fill ins(%cst_0 : f16) outs(%extracted_slice_5 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_8 = tensor.insert_slice %extracted_slice_7 into %58[0, 0] [%48, %51] [1, 1] : tensor<?x?xf16> into tensor<1x1xf16>
scf.yield %inserted_slice_8 : tensor<1x1xf16>
}
%55 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%54 : tensor<1x1xf16>) outs(%extracted_slice_5 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_6 = tensor.insert_slice %55 into %arg13[%46#0, %46#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_6 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %24#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%25 = vector.transfer_read %expanded[%arg3, %c0, %arg5, %19, %c0, %20], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<2x1x2x16x1x16xf16>, vector<1x1x1x1x1x4xf16>
%26 = vector.transpose %25, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_3 = tensor.expand_shape %24#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%27 = vector.transfer_read %expanded_3[%c0, %20, %c0, %19], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x16x1x16xf16>, vector<1x4x1x1xf16>
%28 = vector.transpose %27, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%29 = iree_gpu.multi_mma %26, %28, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %29 : vector<1x1x1x1x4x1xf32>
}
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %20, 0, %19] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%22 = vector.transpose %21, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%23 = vector.transfer_write %22, %extracted_slice_2[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg9[0, 0, 0, %20, 0, %19] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %13 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%extracted_slice = tensor.extract_slice %arg2[0, %arg0, 0, %8] [2, 1, 17, %9] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%15 = tensor.empty(%9) : tensor<2x1x17x?xf32>
%16 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%9)
%17 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %16) shared_outs(%arg7 = %extracted_slice) -> (tensor<2x1x17x?xf32>) {
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%19 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%9]
%extracted_slice_1 = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %18] [1, 1, 1, %19] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %18] [1, 1, 1, %19] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%20 = scf.for %arg8 = %c0 to %19 step %c1 iter_args(%arg9 = %extracted_slice_2) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_1[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%21 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %21 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg7[%arg3, 0, %arg5, %18] [1, 1, 1, %19] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg2[0, %arg0, 0, %8] [2, 1, 17, %9] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
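// (note: remaining tensor.empty ops are converted to bufferization.alloc_tensor, e.g. %11 and the 1x1x1x4 scratch tensor below, so that one-shot bufferization can assign them buffers.)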
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>> -> tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = bufferization.alloc_tensor() : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%16 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x16x1x16xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%18 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%19 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
%22:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%28 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%29:3 = affine.delinearize_index %28 into (2, 32, 4) : index, index, index
%30 = affine.apply affine_map<(d0) -> (d0 * 4)>(%29#2)
%31 = affine.min affine_map<(d0) -> (2, d0)>(%29#0)
%32 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = affine.min affine_map<(d0) -> (17, d0)>(%29#1)
%35 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %33 : i1
%38 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %29#2)
%39 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%38)
%40 = arith.cmpi eq, %39, %c0 : index
%41 = arith.ori %40, %37 : i1
%42 = scf.if %41 -> (tensor<1x1x1x4xf16>) {
%54 = bufferization.alloc_tensor() : tensor<1x1x1x4xf16>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%54 : tensor<1x1x1x4xf16>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
} -> tensor<1x1x1x4xf16>
scf.yield %55 : tensor<1x1x1x4xf16>
} else {
%extracted_slice_7 = tensor.extract_slice %3[%31, 0, 0, 0] [%32, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%54 = bufferization.alloc_tensor(%32, %35, %39) : tensor<?x1x?x?xf16>
%55 = scf.for %arg14 = %c0 to %32 step %c1 iter_args(%arg15 = %54) -> (tensor<?x1x?x?xf16>) {
%58 = scf.for %arg16 = %c0 to %35 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%59 = scf.for %arg18 = %c0 to %39 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%60 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%38, %arg18)
%61 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %34, %38, %arg18)
%62 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %34, %38, %arg18)
%extracted_slice_11 = tensor.extract_slice %extracted_slice_7[%arg14, %61, %62, %60] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_12 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%63 = linalg.copy ins(%extracted_slice_11 : tensor<1x1x1x1xf16>) outs(%extracted_slice_12 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_13 = tensor.insert_slice %63 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_13 : tensor<?x1x?x?xf16>
}
scf.yield %59 : tensor<?x1x?x?xf16>
}
scf.yield %58 : tensor<?x1x?x?xf16>
}
%56 = bufferization.alloc_tensor() : tensor<1x1x1x4xf16>
%57 = linalg.fill ins(%cst_0 : f16) outs(%56 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%dim = tensor.dim %55, %c0 : tensor<?x1x?x?xf16>
%dim_8 = tensor.dim %55, %c2 : tensor<?x1x?x?xf16>
%dim_9 = tensor.dim %55, %c3 : tensor<?x1x?x?xf16>
%inserted_slice_10 = tensor.insert_slice %55 into %57[0, 0, 0, 0] [%dim, 1, %dim_8, %dim_9] [1, 1, 1, 1] : tensor<?x1x?x?xf16> into tensor<1x1x1x4xf16>
scf.yield %inserted_slice_10 : tensor<1x1x1x4xf16>
}
%extracted_slice_4 = tensor.extract_slice %arg12[%29#0, 0, %29#1, %30] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%43 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%42 : tensor<1x1x1x4xf16>) outs(%extracted_slice_4 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %43 into %arg12[%29#0, 0, %29#1, %30] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%44:2 = affine.delinearize_index %28 into (16, 16) : index, index
%extracted_slice_5 = tensor.extract_slice %arg13[%44#0, %44#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%45 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%44#0]
%46 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%45)
%47 = arith.cmpi eq, %46, %c0 : index
%48 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%44#1, %arg1)
%49 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %48)
%50 = arith.cmpi eq, %49, %c0 : index
%51 = arith.ori %50, %47 : i1
%52 = scf.if %51 -> (tensor<1x1xf16>) {
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%extracted_slice_5 : tensor<1x1xf16>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
} -> tensor<1x1xf16>
scf.yield %54 : tensor<1x1xf16>
} else {
%54 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%48]
%extracted_slice_7 = tensor.extract_slice %5[%45, %54] [%46, %49] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%55 = linalg.fill ins(%cst_0 : f16) outs(%extracted_slice_5 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_8 = tensor.insert_slice %extracted_slice_7 into %55[0, 0] [%46, %49] [1, 1] : tensor<?x?xf16> into tensor<1x1xf16>
scf.yield %inserted_slice_8 : tensor<1x1xf16>
}
%53 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%52 : tensor<1x1xf16>) outs(%extracted_slice_5 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_6 = tensor.insert_slice %53 into %arg13[%44#0, %44#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_6 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %22#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%23 = vector.transfer_read %expanded[%arg3, %c0, %arg5, %17, %c0, %18], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<2x1x2x16x1x16xf16>, vector<1x1x1x1x1x4xf16>
%24 = vector.transpose %23, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_3 = tensor.expand_shape %22#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%25 = vector.transfer_read %expanded_3[%c0, %18, %c0, %17], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x16x1x16xf16>, vector<1x4x1x1xf16>
%26 = vector.transpose %25, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%27 = iree_gpu.multi_mma %24, %26, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %27 : vector<1x1x1x1x4x1xf32>
}
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %18, 0, %17] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%20 = vector.transpose %19, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%21 = vector.transfer_write %20, %extracted_slice_2[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %21 into %arg9[0, 0, 0, %18, 0, %17] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%extracted_slice = tensor.extract_slice %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%14 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%15 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %14) shared_outs(%arg7 = %extracted_slice) -> (tensor<2x1x17x?xf32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%17 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice_1 = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %16] [1, 1, 1, %17] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %16] [1, 1, 1, %17] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%18 = scf.for %arg8 = %c0 to %17 step %c1 iter_args(%arg9 = %extracted_slice_2) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_1[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%19 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %19 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg7[%arg3, 0, %arg5, %16] [1, 1, 1, %17] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %15 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After GPUInferMemorySpacePass (iree-codegen-gpu-infer-memory-space) //----- //
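// In this dispatch, GPUInferMemorySpacePass adds `memory_space` annotations to
// the alloc_tensor ops introduced above: the accumulator shared by the
// warp-level scf.forall (%11) is placed in #gpu.address_space<workgroup>,
// while the per-thread staging tensors inside the scf.if (%54, %56) are placed
// in #gpu.address_space<private>. A minimal sketch (illustrative shape only):
//   %t = bufferization.alloc_tensor() : tensor<16x16xf16>
//   // becomes
//   %t = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>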
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>> -> tensor<2x17x17x1281xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [11529, 1281], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11529x1281xf16>> -> tensor<11529x1281xf16>
%6 = scf.forall (%arg0, %arg1) in (17, 81) shared_outs(%arg2 = %4) -> (tensor<2x17x17x1281xf32>) {
%7 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%8 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf16>
%10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
%11 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x2x16x1x16xf32>
%12 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 2, 1) shared_outs(%arg7 = %11) -> (tensor<2x1x2x16x1x16xf32>) {
%extracted_slice_1 = tensor.extract_slice %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<2x1x2x16x1x16xf32> to tensor<1x1x1x16x1x16xf32>
%16 = scf.forall (%arg8) in (64) shared_outs(%arg9 = %extracted_slice_1) -> (tensor<1x1x1x16x1x16xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg8)
%18 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg8)
%19 = scf.for %arg10 = %c0 to %c721 step %c1 iter_args(%arg11 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
%22:2 = iree_gpu.barrier_region ins(%9, %10 : tensor<2x1x32x16xf16>, tensor<16x16xf16>) {
^bb0(%arg12: tensor<2x1x32x16xf16>, %arg13: tensor<16x16xf16>):
%28 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg8, %arg5, %arg3)
%29:3 = affine.delinearize_index %28 into (2, 32, 4) : index, index, index
%30 = affine.apply affine_map<(d0) -> (d0 * 4)>(%29#2)
%31 = affine.min affine_map<(d0) -> (2, d0)>(%29#0)
%32 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = affine.min affine_map<(d0) -> (17, d0)>(%29#1)
%35 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %33 : i1
%38 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg10, %29#2)
%39 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%38)
%40 = arith.cmpi eq, %39, %c0 : index
%41 = arith.ori %40, %37 : i1
%42 = scf.if %41 -> (tensor<1x1x1x4xf16>) {
%54 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<private>} : tensor<1x1x1x4xf16>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%54 : tensor<1x1x1x4xf16>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
} -> tensor<1x1x1x4xf16>
scf.yield %55 : tensor<1x1x1x4xf16>
} else {
%extracted_slice_7 = tensor.extract_slice %3[%31, 0, 0, 0] [%32, 35, 35, 1281] [1, 1, 1, 1] : tensor<2x35x35x1281xf16> to tensor<?x35x35x1281xf16>
%54 = bufferization.alloc_tensor(%32, %35, %39) {memory_space = #gpu.address_space<private>} : tensor<?x1x?x?xf16>
%55 = scf.for %arg14 = %c0 to %32 step %c1 iter_args(%arg15 = %54) -> (tensor<?x1x?x?xf16>) {
%58 = scf.for %arg16 = %c0 to %35 step %c1 iter_args(%arg17 = %arg15) -> (tensor<?x1x?x?xf16>) {
%59 = scf.for %arg18 = %c0 to %39 step %c1 iter_args(%arg19 = %arg17) -> (tensor<?x1x?x?xf16>) {
%60 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%38, %arg18)
%61 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg16, %arg0, %34, %38, %arg18)
%62 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg16, %arg0, %34, %38, %arg18)
%extracted_slice_11 = tensor.extract_slice %extracted_slice_7[%arg14, %61, %62, %60] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x35x35x1281xf16> to tensor<1x1x1x1xf16>
%extracted_slice_12 = tensor.extract_slice %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<?x1x?x?xf16> to tensor<1x1x1x1xf16>
%63 = linalg.copy ins(%extracted_slice_11 : tensor<1x1x1x1xf16>) outs(%extracted_slice_12 : tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
%inserted_slice_13 = tensor.insert_slice %63 into %arg19[%arg14, 0, %arg16, %arg18] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf16> into tensor<?x1x?x?xf16>
scf.yield %inserted_slice_13 : tensor<?x1x?x?xf16>
}
scf.yield %59 : tensor<?x1x?x?xf16>
}
scf.yield %58 : tensor<?x1x?x?xf16>
}
%56 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<private>} : tensor<1x1x1x4xf16>
%57 = linalg.fill ins(%cst_0 : f16) outs(%56 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%dim = tensor.dim %55, %c0 : tensor<?x1x?x?xf16>
%dim_8 = tensor.dim %55, %c2 : tensor<?x1x?x?xf16>
%dim_9 = tensor.dim %55, %c3 : tensor<?x1x?x?xf16>
%inserted_slice_10 = tensor.insert_slice %55 into %57[0, 0, 0, 0] [%dim, 1, %dim_8, %dim_9] [1, 1, 1, 1] : tensor<?x1x?x?xf16> into tensor<1x1x1x4xf16>
scf.yield %inserted_slice_10 : tensor<1x1x1x4xf16>
}
%extracted_slice_4 = tensor.extract_slice %arg12[%29#0, 0, %29#1, %30] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x1x32x16xf16> to tensor<1x1x1x4xf16>
%43 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%42 : tensor<1x1x1x4xf16>) outs(%extracted_slice_4 : tensor<1x1x1x4xf16>) -> tensor<1x1x1x4xf16>
%inserted_slice = tensor.insert_slice %43 into %arg12[%29#0, 0, %29#1, %30] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<1x1x1x4xf16> into tensor<2x1x32x16xf16>
%44:2 = affine.delinearize_index %28 into (16, 16) : index, index
%extracted_slice_5 = tensor.extract_slice %arg13[%44#0, %44#1] [1, 1] [1, 1] : tensor<16x16xf16> to tensor<1x1xf16>
%45 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg10)[%44#0]
%46 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%45)
%47 = arith.cmpi eq, %46, %c0 : index
%48 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%44#1, %arg1)
%49 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%8, %48)
%50 = arith.cmpi eq, %49, %c0 : index
%51 = arith.ori %50, %47 : i1
%52 = scf.if %51 -> (tensor<1x1xf16>) {
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%extracted_slice_5 : tensor<1x1xf16>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
} -> tensor<1x1xf16>
scf.yield %54 : tensor<1x1xf16>
} else {
%54 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%48]
%extracted_slice_7 = tensor.extract_slice %5[%45, %54] [%46, %49] [1, 1] : tensor<11529x1281xf16> to tensor<?x?xf16>
%55 = linalg.fill ins(%cst_0 : f16) outs(%extracted_slice_5 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_8 = tensor.insert_slice %extracted_slice_7 into %55[0, 0] [%46, %49] [1, 1] : tensor<?x?xf16> into tensor<1x1xf16>
scf.yield %inserted_slice_8 : tensor<1x1xf16>
}
%53 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%52 : tensor<1x1xf16>) outs(%extracted_slice_5 : tensor<1x1xf16>) -> tensor<1x1xf16>
%inserted_slice_6 = tensor.insert_slice %53 into %arg13[%44#0, %44#1] [1, 1] [1, 1] : tensor<1x1xf16> into tensor<16x16xf16>
iree_gpu.yield %inserted_slice, %inserted_slice_6 : tensor<2x1x32x16xf16>, tensor<16x16xf16>
} : tensor<2x1x32x16xf16>, tensor<16x16xf16>
%expanded = tensor.expand_shape %22#0 [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : tensor<2x1x32x16xf16> into tensor<2x1x2x16x1x16xf16>
%23 = vector.transfer_read %expanded[%arg3, %c0, %arg5, %17, %c0, %18], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<2x1x2x16x1x16xf16>, vector<1x1x1x1x1x4xf16>
%24 = vector.transpose %23, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expanded_3 = tensor.expand_shape %22#1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : tensor<16x16xf16> into tensor<1x16x1x16xf16>
%25 = vector.transfer_read %expanded_3[%c0, %18, %c0, %17], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x16x1x16xf16>, vector<1x4x1x1xf16>
%26 = vector.transpose %25, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%27 = iree_gpu.multi_mma %24, %26, %arg11 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %27 : vector<1x1x1x1x4x1xf32>
}
%extracted_slice_2 = tensor.extract_slice %arg9[0, 0, 0, %18, 0, %17] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> to tensor<1x1x1x4x1x1xf32>
%20 = vector.transpose %19, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%21 = vector.transfer_write %20, %extracted_slice_2[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, tensor<1x1x1x4x1x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %21 into %arg9[0, 0, 0, %18, 0, %17] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x4x1x1xf32> into tensor<1x1x1x16x1x16xf32>
}
} {mapping = [#iree_gpu.lane_id<0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg7[%arg3, 0, %arg5, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x16x1x16xf32> into tensor<2x1x2x16x1x16xf32>
}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<2x1x32x16xf32>
%collapsed = tensor.collapse_shape %12 [[0], [1], [2, 3], [4, 5]] : tensor<2x1x2x16x1x16xf32> into tensor<2x1x32x16xf32>
%extracted_slice = tensor.extract_slice %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x17x17x1281xf32> to tensor<2x1x17x?xf32>
%14 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%8)
%15 = scf.forall (%arg3, %arg4, %arg5, %arg6) in (2, 1, 17, %14) shared_outs(%arg7 = %extracted_slice) -> (tensor<2x1x17x?xf32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
%17 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg6)[%8]
%extracted_slice_1 = tensor.extract_slice %collapsed[%arg3, 0, %arg5, %16] [1, 1, 1, %17] [1, 1, 1, 1] : tensor<2x1x32x16xf32> to tensor<1x1x1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg7[%arg3, 0, %arg5, %16] [1, 1, 1, %17] [1, 1, 1, 1] : tensor<2x1x17x?xf32> to tensor<1x1x1x?xf32>
%18 = scf.for %arg8 = %c0 to %17 step %c1 iter_args(%arg9 = %extracted_slice_2) -> (tensor<1x1x1x?xf32>) {
%extracted_slice_3 = tensor.extract_slice %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%extracted_slice_4 = tensor.extract_slice %extracted_slice_1[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x?xf32> to tensor<1x1x1x1xf32>
%19 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_4 : tensor<1x1x1x1xf32>) outs(%extracted_slice_3 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
%inserted_slice = tensor.insert_slice %19 into %arg9[0, 0, 0, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x1x1x1xf32> into tensor<1x1x1x?xf32>
scf.yield %inserted_slice : tensor<1x1x1x?xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg7[%arg3, 0, %arg5, %16] [1, 1, 1, %17] [1, 1, 1, 1] : tensor<1x1x1x?xf32> into tensor<2x1x17x?xf32>
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %15 into %arg2[0, %arg0, 0, %7] [2, 1, 17, %8] [1, 1, 1, 1] : tensor<2x1x17x?xf32> into tensor<2x17x17x1281xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
return
}
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- //
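// Comprehensive bufferization replaces the tensor SSA values with memrefs:
// the hal.interface.binding.subspan ops now produce memrefs (with
// memref.assume_alignment), workgroup alloc_tensors become memref.alloc in
// #gpu.address_space<workgroup>, private ones become memref.alloca, and the
// extract_slice/insert_slice pairs turn into memref.subview plus
// linalg.copy/memref.copy. A minimal sketch of the alloc rewrite
// (illustrative shape only):
//   %t = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x16xf16>
//   // becomes
//   %m = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
// The iree_gpu.barrier_region now appears as plain gpu.barrier ops around the
// shared-memory fills, and the trailing `memref.copy %2, %2` that replaces
// flow.dispatch.tensor.store copies the output buffer onto itself, i.e. it is
// a no-op.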
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_4 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_13 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
%39 = scf.for %arg9 = %c0 to %14 step %c1 iter_args(%arg10 = %alloca) -> (memref<?x1x?x?xf16, #gpu.address_space<private>>) {
%40 = scf.for %arg11 = %c0 to %17 step %c1 iter_args(%arg12 = %arg10) -> (memref<?x1x?x?xf16, #gpu.address_space<private>>) {
%41 = scf.for %arg13 = %c0 to %21 step %c1 iter_args(%arg14 = %arg12) -> (memref<?x1x?x?xf16, #gpu.address_space<private>>) {
%42 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg13)
%43 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg11, %arg0, %16, %20, %arg13)
%44 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg11, %arg0, %16, %20, %arg13)
%subview_18 = memref.subview %subview_13[%arg9, %43, %44, %42] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_19 = memref.subview %arg14[%arg9, 0, %arg11, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_18 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_19 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
%subview_20 = memref.subview %arg14[%arg9, 0, %arg11, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
memref.copy %subview_19, %subview_20 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
scf.yield %arg14 : memref<?x1x?x?xf16, #gpu.address_space<private>>
}
scf.yield %41 : memref<?x1x?x?xf16, #gpu.address_space<private>>
}
scf.yield %40 : memref<?x1x?x?xf16, #gpu.address_space<private>>
}
%alloca_14 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_14 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%dim = memref.dim %39, %c0 : memref<?x1x?x?xf16, #gpu.address_space<private>>
%dim_15 = memref.dim %39, %c2 : memref<?x1x?x?xf16, #gpu.address_space<private>>
%dim_16 = memref.dim %39, %c3 : memref<?x1x?x?xf16, #gpu.address_space<private>>
%subview_17 = memref.subview %alloca_14[0, 0, 0, 0] [%dim, 1, %dim_15, %dim_16] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %39, %subview_17 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_14 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_8 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_8 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_10 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
%33 = scf.if %32 -> (memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
} else {
%39 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_13 = memref.subview %1[%26, %39] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_14 = memref.subview %subview_10[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_13, %subview_14 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.yield %subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%33 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_11 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_10, %subview_11 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%34 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%35 = vector.transpose %34, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_12 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%36 = vector.transfer_read %expand_shape_12[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%37 = vector.transpose %36, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%38 = iree_gpu.multi_mma %35, %37, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %38 : vector<1x1x1x1x4x1xf32>
}
%subview_6 = memref.subview %subview_4[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_6[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview_4[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
%subview_5 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_4 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = scf.for %arg6 = %c0 to %7 step %c1 iter_args(%arg7 = %subview_5) -> (memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
%subview_7 = memref.subview %arg7[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_8 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_7 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_9 = memref.subview %arg7[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_7, %subview_9 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.yield %arg7 : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
%subview_6 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %8, %subview_6 : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%subview_3 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview, %subview_3 : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.copy %2, %2 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
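// ResolveShapedTypeResultDims rewrites dim queries on op results in terms of
// the operands that define those shapes. In the portion of this dispatch shown
// here the IR is unchanged from the previous dump; the memref.dim ops on the
// scf.for result %39 (%dim, %dim_15, %dim_16) are still present at this point.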
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_4 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_13 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
%39 = scf.for %arg9 = %c0 to %14 step %c1 iter_args(%arg10 = %alloca) -> (memref<?x1x?x?xf16, #gpu.address_space<private>>) {
%40 = scf.for %arg11 = %c0 to %17 step %c1 iter_args(%arg12 = %arg10) -> (memref<?x1x?x?xf16, #gpu.address_space<private>>) {
%41 = scf.for %arg13 = %c0 to %21 step %c1 iter_args(%arg14 = %arg12) -> (memref<?x1x?x?xf16, #gpu.address_space<private>>) {
%42 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg13)
%43 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg11, %arg0, %16, %20, %arg13)
%44 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg11, %arg0, %16, %20, %arg13)
%subview_18 = memref.subview %subview_13[%arg9, %43, %44, %42] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_19 = memref.subview %arg14[%arg9, 0, %arg11, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_18 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_19 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
%subview_20 = memref.subview %arg14[%arg9, 0, %arg11, %arg13] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
memref.copy %subview_19, %subview_20 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
scf.yield %arg14 : memref<?x1x?x?xf16, #gpu.address_space<private>>
}
scf.yield %41 : memref<?x1x?x?xf16, #gpu.address_space<private>>
}
scf.yield %40 : memref<?x1x?x?xf16, #gpu.address_space<private>>
}
%alloca_14 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_14 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%dim = memref.dim %39, %c0 : memref<?x1x?x?xf16, #gpu.address_space<private>>
%dim_15 = memref.dim %39, %c2 : memref<?x1x?x?xf16, #gpu.address_space<private>>
%dim_16 = memref.dim %39, %c3 : memref<?x1x?x?xf16, #gpu.address_space<private>>
%subview_17 = memref.subview %alloca_14[0, 0, 0, 0] [%dim, 1, %dim_15, %dim_16] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %39, %subview_17 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_14 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_8 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_8 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_10 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
%33 = scf.if %32 -> (memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
} else {
%39 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_13 = memref.subview %1[%26, %39] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_14 = memref.subview %subview_10[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_13, %subview_14 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.yield %subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%33 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_11 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_10, %subview_11 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%34 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%35 = vector.transpose %34, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_12 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%36 = vector.transfer_read %expand_shape_12[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%37 = vector.transpose %36, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%38 = iree_gpu.multi_mma %35, %37, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %38 : vector<1x1x1x1x4x1xf32>
}
%subview_6 = memref.subview %subview_4[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_6[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview_4[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
%subview_5 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_4 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = scf.for %arg6 = %c0 to %7 step %c1 iter_args(%arg7 = %subview_5) -> (memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
%subview_7 = memref.subview %arg7[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_8 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_7 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_9 = memref.subview %arg7[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_7, %subview_9 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.yield %arg7 : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
%subview_6 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %8, %subview_6 : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%subview_3 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview, %subview_3 : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.copy %2, %2 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
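// NOTE: illustrative annotation, not compiler output. Relative to the preceding dump, this
// canonicalizer run erased the copies whose source and destination had become the same view
// (e.g. the trailing memref.copy %2, %2) and dropped the scf.if / scf.for results that only
// forwarded an already-available memref (the loop-carried %subview_5 and the if-yielded
// %subview_10 above). A minimal standalone sketch of both folds, assuming generic upstream
// --canonicalize behavior; the function below is hypothetical and not part of this dispatch:
func.func @canonicalize_sketch(%m: memref<4xf16>, %ub: index) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // Self-copy: source and target are the same value, so the copy folds away.
  memref.copy %m, %m : memref<4xf16> to memref<4xf16>
  // The iter_arg is only passed through, so the result folds to %m and the
  // loop-carried value is dropped.
  %r = scf.for %i = %c0 to %ub step %c1 iter_args(%it = %m) -> (memref<4xf16>) {
    scf.yield %it : memref<4xf16>
  }
  return
}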
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_4 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_13 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg9 = %c0 to %14 step %c1 {
scf.for %arg10 = %c0 to %17 step %c1 {
scf.for %arg11 = %c0 to %21 step %c1 {
%38 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg11)
%39 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg10, %arg0, %16, %20, %arg11)
%40 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg10, %arg0, %16, %20, %arg11)
%subview_16 = memref.subview %subview_13[%arg9, %39, %40, %38] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_17 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_16 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_17 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
%subview_18 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
memref.copy %subview_17, %subview_18 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_14 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_14 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_15 = memref.subview %alloca_14[0, 0, 0, 0] [%14, 1, %17, %21] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_15 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_14 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_8 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_8 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_10 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
scf.if %32 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%38 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_13 = memref.subview %1[%26, %38] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_10 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_14 = memref.subview %subview_10[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_13, %subview_14 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
%subview_11 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_10, %subview_11 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%33 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%34 = vector.transpose %33, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_12 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%35 = vector.transfer_read %expand_shape_12[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%36 = vector.transpose %35, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%37 = iree_gpu.multi_mma %34, %36, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %37 : vector<1x1x1x1x4x1xf32>
}
%subview_6 = memref.subview %subview_4[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_6[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview_4[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
%subview_5 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_4 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg6 = %c0 to %7 step %c1 {
%subview_7 = memref.subview %subview_5[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_8 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_7 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_9 = memref.subview %subview_5[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_7, %subview_9 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
%subview_6 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_5, %subview_6 : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%subview_3 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview, %subview_3 : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
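// NOTE: illustrative annotation, not compiler output. CSE deduplicates identical side-effect-free
// ops, so the pairs of memref.subview ops above that recomputed the same slice (e.g. %subview_8 /
// %subview_9) collapse into a single value in the dump below, turning the copies between them into
// self-copies that the next canonicalizer run erases. A minimal standalone sketch; the function is
// hypothetical and not part of this dispatch:
func.func @cse_sketch(%buf: memref<16x16xf16>, %i: index) {
  // Two textually identical subviews: CSE keeps %a and rewires uses of %b to it,
  // which makes the copy below a self-copy.
  %a = memref.subview %buf[%i, 0] [1, 4] [1, 1] : memref<16x16xf16> to memref<1x4xf16, strided<[16, 1], offset: ?>>
  %b = memref.subview %buf[%i, 0] [1, 4] [1, 1] : memref<16x16xf16> to memref<1x4xf16, strided<[16, 1], offset: ?>>
  memref.copy %a, %b : memref<1x4xf16, strided<[16, 1], offset: ?>> to memref<1x4xf16, strided<[16, 1], offset: ?>>
  return
}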
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_3 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_8 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg9 = %c0 to %14 step %c1 {
scf.for %arg10 = %c0 to %17 step %c1 {
scf.for %arg11 = %c0 to %21 step %c1 {
%38 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg11)
%39 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg10, %arg0, %16, %20, %arg11)
%40 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg10, %arg0, %16, %20, %arg11)
%subview_11 = memref.subview %subview_8[%arg9, %39, %40, %38] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_11 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_12 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
memref.copy %subview_12, %subview_12 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_9 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_10 = memref.subview %alloca_9[0, 0, 0, 0] [%14, 1, %17, %21] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_10 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_5 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_5 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
memref.copy %subview_5, %subview_5 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_6 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
scf.if %32 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%38 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_8 = memref.subview %1[%26, %38] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %subview_6[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
memref.copy %subview_6, %subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%33 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%34 = vector.transpose %33, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_7 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%35 = vector.transfer_read %expand_shape_7[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%36 = vector.transpose %35, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%37 = iree_gpu.multi_mma %34, %36, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %37 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview_3[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_4 : memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
memref.copy %subview_3, %subview_3 : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_3 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg6 = %c0 to %7 step %c1 {
%subview_5 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview_3[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_6 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_5 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.copy %subview_5, %subview_5 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
memref.copy %subview_4, %subview_4 : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
memref.copy %subview, %subview : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
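// NOTE: illustrative annotation, not compiler output. This second canonicalizer run cleans up what
// CSE exposed: the copies whose operands are now the same value (memref.copy %subview_5, %subview_5
// and friends above) fold away, as sketched after the first canonicalizer dump.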
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_3 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_8 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg9 = %c0 to %14 step %c1 {
scf.for %arg10 = %c0 to %17 step %c1 {
scf.for %arg11 = %c0 to %21 step %c1 {
%38 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg11)
%39 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg10, %arg0, %16, %20, %arg11)
%40 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg10, %arg0, %16, %20, %arg11)
%subview_11 = memref.subview %subview_8[%arg9, %39, %40, %38] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_11 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_12 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
}
}
}
%alloca_9 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_10 = memref.subview %alloca_9[0, 0, 0, 0] [%14, 1, %17, %21] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_10 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_5 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_5 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_6 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
scf.if %32 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%38 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_8 = memref.subview %1[%26, %38] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %subview_6[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%33 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%34 = vector.transpose %33, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_7 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%35 = vector.transfer_read %expand_shape_7[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%36 = vector.transpose %35, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%37 = iree_gpu.multi_mma %34, %36, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %37 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview_3[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_3 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg6 = %c0 to %7 step %c1 {
%subview_5 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview_3[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_6 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_5 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
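// NOTE: illustrative annotation, not compiler output. In the portion reproduced below the function
// is unchanged from the preceding canonicalized dump. Assuming this cleanup pass targets leftover
// allocations and view-like ops that the earlier copy/CSE cleanups left dead (an assumption, not
// something visible in this dump), the kind of IR it would remove looks like this hypothetical sketch:
func.func @dead_alloc_sketch() {
  // An allocation with no remaining users; cleanup of this kind simply drops it.
  %unused = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
  return
}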
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_3 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_8 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg9 = %c0 to %14 step %c1 {
scf.for %arg10 = %c0 to %17 step %c1 {
scf.for %arg11 = %c0 to %21 step %c1 {
%38 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg11)
%39 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg10, %arg0, %16, %20, %arg11)
%40 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg10, %arg0, %16, %20, %arg11)
%subview_11 = memref.subview %subview_8[%arg9, %39, %40, %38] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_11 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_12 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
}
}
}
%alloca_9 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_10 = memref.subview %alloca_9[0, 0, 0, 0] [%14, 1, %17, %21] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_10 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_5 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_5 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_6 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
scf.if %32 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%38 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_8 = memref.subview %1[%26, %38] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %subview_6[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%33 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%34 = vector.transpose %33, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_7 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%35 = vector.transfer_read %expand_shape_7[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%36 = vector.transpose %35, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%37 = iree_gpu.multi_mma %34, %36, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %37 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview_3[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_3 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg6 = %c0 to %7 step %c1 {
%subview_5 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview_3[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_6 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_5 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
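// Note (not part of the pass output): a minimal, hypothetical sketch of how the linear thread
// id is turned into coordinates for filling the 2x1x32x16 LHS tile in workgroup memory.
// affine.delinearize_index splits the id with the last basis element fastest, so the 256
// threads of the workgroup each own one contiguous 4 x f16 slice of the 2 x 32 x 16 tile.
func.func @linear_tid_to_lhs_coords(%tid: index) -> (index, index, index) {
  // For %tid in [0, 256): batch = %tid floordiv 128, row = (%tid mod 128) floordiv 4,
  // vec = %tid mod 4; the per-thread copy then starts at column vec * 4 of the 16-wide row.
  %batch, %row, %vec = affine.delinearize_index %tid into (2, 32, 4) : index, index, index
  return %batch, %row, %vec : index, index, index
}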
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_3 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_8 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg9 = %c0 to %14 step %c1 {
scf.for %arg10 = %c0 to %17 step %c1 {
scf.for %arg11 = %c0 to %21 step %c1 {
%38 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg11)
%39 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg10, %arg0, %16, %20, %arg11)
%40 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg10, %arg0, %16, %20, %arg11)
%subview_11 = memref.subview %subview_8[%arg9, %39, %40, %38] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_11 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_12 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
}
}
}
%alloca_9 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_10 = memref.subview %alloca_9[0, 0, 0, 0] [%14, 1, %17, %21] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_10 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_5 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_5 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_6 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
scf.if %32 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%38 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_8 = memref.subview %1[%26, %38] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %subview_6[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%33 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%34 = vector.transpose %33, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_7 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%35 = vector.transfer_read %expand_shape_7[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%36 = vector.transpose %35, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%37 = iree_gpu.multi_mma %34, %36, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %37 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview_3[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_3 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg6 = %c0 to %7 step %c1 {
%subview_5 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview_3[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_6 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_5 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
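// Note (not part of the pass output): a simplified, hypothetical helper showing the scalar
// index arithmetic behind the im2col gather in the innermost loops above, for a 3x3 kernel
// with strides [2, 2] over 1281 input channels. The reduction index decomposes as
// k = kh * 3843 + kw * 1281 + c (3843 = 3 * 1281), and the sampled input pixel for output
// position (oh, ow) is (2 * oh + kh, 2 * ow + kw); the maps in the dump fold the output-row
// delinearization into the same expressions, which is why they look more involved.
func.func @im2col_indices(%oh: index, %ow: index, %k: index) -> (index, index, index) {
  %c  = affine.apply affine_map<(d0) -> (d0 mod 1281)>(%k)
  %ih = affine.apply affine_map<(d0, d1) -> (d0 * 2 + d1 floordiv 3843)>(%oh, %k)
  %iw = affine.apply affine_map<(d0, d1) -> (d0 * 2 + (d1 mod 3843) floordiv 1281)>(%ow, %k)
  return %ih, %iw, %c : index, index, index
}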
// -----// IR Dump After CSE (cse) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_3 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_8 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg9 = %c0 to %14 step %c1 {
scf.for %arg10 = %c0 to %17 step %c1 {
scf.for %arg11 = %c0 to %21 step %c1 {
%38 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg11)
%39 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg10, %arg0, %16, %20, %arg11)
%40 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg10, %arg0, %16, %20, %arg11)
%subview_11 = memref.subview %subview_8[%arg9, %39, %40, %38] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_11 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_12 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
}
}
}
%alloca_9 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_10 = memref.subview %alloca_9[0, 0, 0, 0] [%14, 1, %17, %21] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_10 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_5 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_5 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_6 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
scf.if %32 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%38 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_8 = memref.subview %1[%26, %38] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %subview_6[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%33 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%34 = vector.transpose %33, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_7 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%35 = vector.transfer_read %expand_shape_7[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%36 = vector.transpose %35, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%37 = iree_gpu.multi_mma %34, %36, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %37 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview_3[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_3 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg6 = %c0 to %7 step %c1 {
%subview_5 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview_3[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_6 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_5 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
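// Note (not part of the pass output): a minimal, hypothetical sketch of the per-lane offsets
// used by the vector.transfer_reads that feed iree_gpu.multi_mma above. Each of the 64 lanes
// in the subgroup addresses one position along the 16-wide dimension and a 4-element slice of
// the 16-deep K dimension of the MFMA_F32_16x16x16_F16 operand tiles.
func.func @mfma_lane_offsets(%lane: index) -> (index, index) {
  // Position along the 16-wide dimension: %lane mod 16.
  %pos = affine.apply affine_map<(d0) -> (d0 mod 16)>(%lane)
  // Start of the lane's 4-element K slice: (%lane floordiv 16) * 4, one of {0, 4, 8, 12}.
  %kbase = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%lane)
  return %pos, %kbase : index, index
}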
// -----// IR Dump After NormalizeLoopBoundsPass (iree-codegen-normalize-loop-bounds) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_3 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_8 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg9 = %c0 to %14 step %c1 {
scf.for %arg10 = %c0 to %17 step %c1 {
scf.for %arg11 = %c0 to %21 step %c1 {
%38 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg11)
%39 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg10, %arg0, %16, %20, %arg11)
%40 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg10, %arg0, %16, %20, %arg11)
%subview_11 = memref.subview %subview_8[%arg9, %39, %40, %38] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_11 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_12 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
}
}
}
%alloca_9 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_10 = memref.subview %alloca_9[0, 0, 0, 0] [%14, 1, %17, %21] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_10 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_5 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_5 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_6 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
scf.if %32 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%38 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_8 = memref.subview %1[%26, %38] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %subview_6[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%33 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%34 = vector.transpose %33, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_7 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%35 = vector.transfer_read %expand_shape_7[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%36 = vector.transpose %35, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%37 = iree_gpu.multi_mma %34, %36, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %37 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview_3[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_3 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg6 = %c0 to %7 step %c1 {
%subview_5 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview_3[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_6 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_5 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After GPUVerifyDistributionPass (iree-codegen-gpu-verify-distribution) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 2, 1) {
%subview_3 = memref.subview %alloc_2[%arg2, 0, %arg4, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.forall (%arg6) in (64) {
%6 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg6)
%7 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%arg6)
%8 = scf.for %arg7 = %c0 to %c721 step %c1 iter_args(%arg8 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%arg6, %arg4, %arg2)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%20 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg7, %11#2)
%21 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = scf.if %23 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_8 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %21) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg9 = %c0 to %14 step %c1 {
scf.for %arg10 = %c0 to %17 step %c1 {
scf.for %arg11 = %c0 to %21 step %c1 {
%38 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%20, %arg11)
%39 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg10, %arg0, %16, %20, %arg11)
%40 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg10, %arg0, %16, %20, %arg11)
%subview_11 = memref.subview %subview_8[%arg9, %39, %40, %38] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %alloca[%arg9, 0, %arg10, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_11 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_12 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
}
}
}
%alloca_9 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_10 = memref.subview %alloca_9[0, 0, 0, 0] [%14, 1, %17, %21] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_10 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_9 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_5 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%24 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_5 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%25:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_6 = memref.subview %alloc_1[%25#0, %25#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg7)[%25#0]
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%25#1, %arg1)
%30 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %29)
%31 = arith.cmpi eq, %30, %c0 : index
%32 = arith.ori %31, %28 : i1
scf.if %32 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%38 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%29]
%subview_8 = memref.subview %1[%26, %38] [%27, %30] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_6 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_9 = memref.subview %subview_6[0, 0] [%27, %30] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%33 = vector.transfer_read %expand_shape[%arg2, %c0, %arg4, %6, %c0, %7], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%34 = vector.transpose %33, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_7 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%35 = vector.transfer_read %expand_shape_7[%c0, %7, %c0, %6], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%36 = vector.transpose %35, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%37 = iree_gpu.multi_mma %34, %36, %arg8 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %37 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview_3[0, 0, 0, %7, 0, %6] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%9 = vector.transpose %8, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %9, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
} {mapping = [#iree_gpu.lane_id<0>]}
} {mapping = [#gpu.warp<linear_dim_3>, #gpu.warp<linear_dim_2>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
scf.forall (%arg2, %arg3, %arg4, %arg5) in (2, 1, 17, %5) {
%6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
%7 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%arg5)[%4]
%subview_3 = memref.subview %collapse_shape[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %subview[%arg2, 0, %arg4, %6] [1, 1, 1, %7] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg6 = %c0 to %7 step %c1 {
%subview_5 = memref.subview %subview_4[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview_3[0, 0, 0, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_6 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_5 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
} {mapping = [#gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After GPUDistributeForallPass (iree-codegen-gpu-distribute-forall) //----- //
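// Note: relative to the previous dump, the warp-mapped and lane-mapped scf.forall loops are now distributed explicitly: the warp-level forall becomes an scf.for driven by the linearized gpu.thread_id (256 threads per workgroup, delinearized into 4 warps of 64), and the lane-level forall is replaced by gpu.lane_id.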
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%thread_id_z = gpu.thread_id z
%thread_id_y = gpu.thread_id y
%thread_id_x = gpu.thread_id x
%0 = affine.linearize_index disjoint [%thread_id_z, %thread_id_y, %thread_id_x] by (1, 1, 256) : index
%cst = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%c721 = arith.constant 721 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%4 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%5 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%6:2 = affine.delinearize_index %0 into (4, 64) : index, index
%c2 = arith.constant 2 : index
%c1_3 = arith.constant 1 : index
%c2_4 = arith.constant 2 : index
%c1_5 = arith.constant 1 : index
%c0_6 = arith.constant 0 : index
%c4 = arith.constant 4 : index
%c4_7 = arith.constant 4 : index
gpu.barrier
scf.for %arg2 = %c0_6 to %c4 step %c4_7 {
%9 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %6#0)
%10:4 = affine.delinearize_index %9 into (2, 1, 2, 1) : index, index, index, index
%subview_10 = memref.subview %alloc_2[%10#0, 0, %10#2, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%11 = gpu.lane_id
%12 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%11)
%13 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%11)
%14 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %cst) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%16 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%11, %10#2, %10#0)
%17:3 = affine.delinearize_index %16 into (2, 32, 4) : index, index, index
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%17#2)
%19 = affine.min affine_map<(d0) -> (2, d0)>(%17#0)
%20 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%19)
%21 = arith.cmpi eq, %20, %c0 : index
%22 = affine.min affine_map<(d0) -> (17, d0)>(%17#1)
%23 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = arith.ori %24, %21 : i1
%26 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg3, %17#2)
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %25 : i1
%30 = scf.if %29 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_15 = memref.subview %1[%19, 0, 0, 0] [%20, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%20, %23, %27) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg5 = %c0 to %20 step %c1 {
scf.for %arg6 = %c0 to %23 step %c1 {
scf.for %arg7 = %c0 to %27 step %c1 {
%44 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%26, %arg7)
%45 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg6, %arg0, %22, %26, %arg7)
%46 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg6, %arg0, %22, %26, %arg7)
%subview_18 = memref.subview %subview_15[%arg5, %45, %46, %44] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_19 = memref.subview %alloca[%arg5, 0, %arg6, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
linalg.copy ins(%subview_18 : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_19 : memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>)
}
}
}
%alloca_16 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_16 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_17 = memref.subview %alloca_16[0, 0, 0, 0] [%20, 1, %23, %27] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_17 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_16 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_12 = memref.subview %alloc[%17#0, 0, %17#1, %18] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%30 : memref<1x1x1x4xf16, #gpu.address_space<private>>) outs(%subview_12 : memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%31:2 = affine.delinearize_index %16 into (16, 16) : index, index
%subview_13 = memref.subview %alloc_1[%31#0, %31#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%32 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%31#0]
%33 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%32)
%34 = arith.cmpi eq, %33, %c0 : index
%35 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%31#1, %arg1)
%36 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%5, %35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %37, %34 : i1
scf.if %38 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_13 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%44 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%35]
%subview_15 = memref.subview %2[%32, %44] [%33, %36] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_13 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_16 = memref.subview %subview_13[0, 0] [%33, %36] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_15, %subview_16 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%39 = vector.transfer_read %expand_shape[%10#0, %c0, %10#2, %12, %c0, %13], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%40 = vector.transpose %39, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_14 = memref.expand_shape %alloc_1 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%41 = vector.transfer_read %expand_shape_14[%c0, %13, %c0, %12], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%42 = vector.transpose %41, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%43 = iree_gpu.multi_mma %40, %42, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %43 : vector<1x1x1x1x4x1xf32>
}
%subview_11 = memref.subview %subview_10[0, 0, 0, %13, 0, %12] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%15 = vector.transpose %14, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %15, %subview_11[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_2 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %3[0, %arg0, 0, %4] [2, 1, 17, %5] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%5)
%c2_8 = arith.constant 2 : index
%c1_9 = arith.constant 1 : index
%c17 = arith.constant 17 : index
%8 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%5)
%c256 = arith.constant 256 : index
gpu.barrier
scf.for %arg2 = %0 to %8 step %c256 {
%9:4 = affine.delinearize_index %arg2 into (2, 1, 17, %7) : index, index, index, index
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%9#3)
%11 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%9#3)[%5]
%subview_10 = memref.subview %collapse_shape[%9#0, 0, %9#2, %10] [1, 1, 1, %11] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_11 = memref.subview %subview[%9#0, 0, %9#2, %10] [1, 1, 1, %11] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %11 step %c1 {
%subview_12 = memref.subview %subview_11[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_13 = memref.subview %subview_10[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%subview_13 : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_12 : memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After VectorizeMemrefCopyPass (iree-codegen-vectorize-memref-copy) //----- //
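// Note: the scalar linalg.copy ops from the previous dump are rewritten as vector.transfer_read / vector.transfer_write pairs; the dynamically shaped memref.copy ops (into the private alloca and the 16x16 shared-memory tile) are left as memref.copy.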
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_z = gpu.thread_id z
%thread_id_y = gpu.thread_id y
%thread_id_x = gpu.thread_id x
%0 = affine.linearize_index disjoint [%thread_id_z, %thread_id_y, %thread_id_x] by (1, 1, 256) : index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%4 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%5 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%6:2 = affine.delinearize_index %0 into (4, 64) : index, index
gpu.barrier
scf.for %arg2 = %c0 to %c4 step %c4 {
%9 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %6#0)
%10:4 = affine.delinearize_index %9 into (2, 1, 2, 1) : index, index, index, index
%subview_4 = memref.subview %alloc_3[%10#0, 0, %10#2, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%11 = gpu.lane_id
%12 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%11)
%13 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%11)
%14 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%16 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%11, %10#2, %10#0)
%17:3 = affine.delinearize_index %16 into (2, 32, 4) : index, index, index
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%17#2)
%19 = affine.min affine_map<(d0) -> (2, d0)>(%17#0)
%20 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%19)
%21 = arith.cmpi eq, %20, %c0 : index
%22 = affine.min affine_map<(d0) -> (17, d0)>(%17#1)
%23 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = arith.ori %24, %21 : i1
%26 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg3, %17#2)
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %25 : i1
%30 = scf.if %29 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_9 = memref.subview %1[%19, 0, 0, 0] [%20, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%20, %23, %27) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg5 = %c0 to %20 step %c1 {
scf.for %arg6 = %c0 to %23 step %c1 {
scf.for %arg7 = %c0 to %27 step %c1 {
%45 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%26, %arg7)
%46 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg6, %arg0, %22, %26, %arg7)
%47 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg6, %arg0, %22, %26, %arg7)
%subview_12 = memref.subview %subview_9[%arg5, %46, %47, %45] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_13 = memref.subview %alloca[%arg5, 0, %arg6, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
%48 = vector.transfer_read %subview_12[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %48, %subview_13[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_10 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_11 = memref.subview %alloca_10[0, 0, 0, 0] [%20, 1, %23, %27] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_11 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_6 = memref.subview %alloc[%17#0, 0, %17#1, %18] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%31 = vector.transfer_read %30[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
vector.transfer_write %31, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%32:2 = affine.delinearize_index %16 into (16, 16) : index, index
%subview_7 = memref.subview %alloc_2[%32#0, %32#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%33 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%32#0]
%34 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%32#1, %arg1)
%37 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%5, %36)
%38 = arith.cmpi eq, %37, %c0 : index
%39 = arith.ori %38, %35 : i1
scf.if %39 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%45 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%36]
%subview_9 = memref.subview %2[%33, %45] [%34, %37] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_10 = memref.subview %subview_7[0, 0] [%34, %37] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_9, %subview_10 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%40 = vector.transfer_read %expand_shape[%10#0, %c0, %10#2, %12, %c0, %13], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%41 = vector.transpose %40, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_8 = memref.expand_shape %alloc_2 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%42 = vector.transfer_read %expand_shape_8[%c0, %13, %c0, %12], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%43 = vector.transpose %42, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%44 = iree_gpu.multi_mma %41, %43, %arg4 {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x1x1x1x1x4xf16>, vector<1x1x1x4xf16> into vector<1x1x1x1x4x1xf32>
scf.yield %44 : vector<1x1x1x1x4x1xf32>
}
%subview_5 = memref.subview %subview_4[0, 0, 0, %13, 0, %12] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%15 = vector.transpose %14, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %15, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_3 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %3[0, %arg0, 0, %4] [2, 1, 17, %5] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%5)
%8 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%5)
gpu.barrier
scf.for %arg2 = %0 to %8 step %c256 {
%9:4 = affine.delinearize_index %arg2 into (2, 1, 17, %7) : index, index, index, index
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%9#3)
%11 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%9#3)[%5]
%subview_4 = memref.subview %collapse_shape[%9#0, 0, %9#2, %10] [1, 1, 1, %11] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %subview[%9#0, 0, %9#2, %10] [1, 1, 1, %11] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %11 step %c1 {
%subview_6 = memref.subview %subview_5[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %subview_4[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%12 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
vector.transfer_write %12, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After UnrollToIntrinsicsPass (iree-gpu-unroll-to-intrinsics) //----- //
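// Note: the unit outer dimensions of the iree_gpu.multi_mma operands are peeled off with vector.extract, leaving a single MFMA_F32_16x16x16_F16 intrinsic on vector<1x4xf16> inputs and a vector<4x1xf32> accumulator; the result is vector.broadcast back to the 1x1x1x1x4x1 accumulator shape carried by the scf.for.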
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_z = gpu.thread_id z
%thread_id_y = gpu.thread_id y
%thread_id_x = gpu.thread_id x
%0 = affine.linearize_index disjoint [%thread_id_z, %thread_id_y, %thread_id_x] by (1, 1, 256) : index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%4 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%5 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%6:2 = affine.delinearize_index %0 into (4, 64) : index, index
gpu.barrier
scf.for %arg2 = %c0 to %c4 step %c4 {
%9 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %6#0)
%10:4 = affine.delinearize_index %9 into (2, 1, 2, 1) : index, index, index, index
%subview_4 = memref.subview %alloc_3[%10#0, 0, %10#2, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%11 = gpu.lane_id
%12 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%11)
%13 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%11)
%14 = scf.for %arg3 = %c0 to %c721 step %c1 iter_args(%arg4 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%16 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%11, %10#2, %10#0)
%17:3 = affine.delinearize_index %16 into (2, 32, 4) : index, index, index
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%17#2)
%19 = affine.min affine_map<(d0) -> (2, d0)>(%17#0)
%20 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%19)
%21 = arith.cmpi eq, %20, %c0 : index
%22 = affine.min affine_map<(d0) -> (17, d0)>(%17#1)
%23 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%22)
%24 = arith.cmpi eq, %23, %c0 : index
%25 = arith.ori %24, %21 : i1
%26 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg3, %17#2)
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %25 : i1
%30 = scf.if %29 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_9 = memref.subview %1[%19, 0, 0, 0] [%20, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%20, %23, %27) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg5 = %c0 to %20 step %c1 {
scf.for %arg6 = %c0 to %23 step %c1 {
scf.for %arg7 = %c0 to %27 step %c1 {
%49 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%26, %arg7)
%50 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg6, %arg0, %22, %26, %arg7)
%51 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg6, %arg0, %22, %26, %arg7)
%subview_12 = memref.subview %subview_9[%arg5, %50, %51, %49] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_13 = memref.subview %alloca[%arg5, 0, %arg6, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
%52 = vector.transfer_read %subview_12[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %52, %subview_13[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_10 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_11 = memref.subview %alloca_10[0, 0, 0, 0] [%20, 1, %23, %27] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_11 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_6 = memref.subview %alloc[%17#0, 0, %17#1, %18] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%31 = vector.transfer_read %30[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
vector.transfer_write %31, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%32:2 = affine.delinearize_index %16 into (16, 16) : index, index
%subview_7 = memref.subview %alloc_2[%32#0, %32#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%33 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg3)[%32#0]
%34 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%32#1, %arg1)
%37 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%5, %36)
%38 = arith.cmpi eq, %37, %c0 : index
%39 = arith.ori %38, %35 : i1
scf.if %39 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%49 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%36]
%subview_9 = memref.subview %2[%33, %49] [%34, %37] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_10 = memref.subview %subview_7[0, 0] [%34, %37] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_9, %subview_10 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%40 = vector.transfer_read %expand_shape[%10#0, %c0, %10#2, %12, %c0, %13], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%41 = vector.transpose %40, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_8 = memref.expand_shape %alloc_2 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%42 = vector.transfer_read %expand_shape_8[%c0, %13, %c0, %12], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%43 = vector.transpose %42, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%44 = vector.extract %41[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%45 = vector.extract %43[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%46 = vector.extract %arg4[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%47 = iree_gpu.multi_mma %44, %45, %46 {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = [], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x4xf16>, vector<1x4xf16> into vector<4x1xf32>
%48 = vector.broadcast %47 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %48 : vector<1x1x1x1x4x1xf32>
}
%subview_5 = memref.subview %subview_4[0, 0, 0, %13, 0, %12] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%15 = vector.transpose %14, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %15, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_3 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %3[0, %arg0, 0, %4] [2, 1, 17, %5] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%7 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%5)
%8 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%5)
gpu.barrier
scf.for %arg2 = %0 to %8 step %c256 {
%9:4 = affine.delinearize_index %arg2 into (2, 1, 17, %7) : index, index, index, index
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%9#3)
%11 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%9#3)[%5]
%subview_4 = memref.subview %collapse_shape[%9#0, 0, %9#2, %10] [1, 1, 1, %11] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %subview[%9#0, 0, %9#2, %10] [1, 1, 1, %11] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %11 step %c1 {
%subview_6 = memref.subview %subview_5[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %subview_4[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%12 = vector.transfer_read %subview_7[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
vector.transfer_write %12, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
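// Note: canonicalization folds the single-iteration scf.for over %c4, drops the unused constants and the y/z thread ids, and simplifies the (2, 1, 2, 1) delinearization to (2, 2). The dump below is truncated with the rest of the gist.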
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%5:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%6:2 = affine.delinearize_index %5#0 into (2, 2) : index, index
%subview = memref.subview %alloc_3[%6#0, 0, %6#1, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%7 = gpu.lane_id
%8 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%7)
%9 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%7)
%10 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%14 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%7, %6#1, %6#0)
%15:3 = affine.delinearize_index %14 into (2, 32, 4) : index, index, index
%16 = affine.apply affine_map<(d0) -> (d0 * 4)>(%15#2)
%17 = affine.min affine_map<(d0) -> (2, d0)>(%15#0)
%18 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%17)
%19 = arith.cmpi eq, %18, %c0 : index
%20 = affine.min affine_map<(d0) -> (17, d0)>(%15#1)
%21 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %15#2)
%25 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%24)
%26 = arith.cmpi eq, %25, %c0 : index
%27 = arith.ori %26, %23 : i1
%28 = scf.if %27 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_9 = memref.subview %0[%17, 0, 0, 0] [%18, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%18, %21, %25) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %18 step %c1 {
scf.for %arg5 = %c0 to %21 step %c1 {
scf.for %arg6 = %c0 to %25 step %c1 {
%47 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%24, %arg6)
%48 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %20, %24, %arg6)
%49 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %20, %24, %arg6)
%subview_12 = memref.subview %subview_9[%arg4, %48, %49, %47] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_13 = memref.subview %alloca[%arg4, 0, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
%50 = vector.transfer_read %subview_12[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %50, %subview_13[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_10 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_11 = memref.subview %alloca_10[0, 0, 0, 0] [%18, 1, %21, %25] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_11 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_6 = memref.subview %alloc[%15#0, 0, %15#1, %16] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%29 = vector.transfer_read %28[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
vector.transfer_write %29, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%30:2 = affine.delinearize_index %14 into (16, 16) : index, index
%subview_7 = memref.subview %alloc_2[%30#0, %30#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%31 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%30#0]
%32 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%30#1, %arg1)
%35 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %33 : i1
scf.if %37 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%47 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%34]
%subview_9 = memref.subview %1[%31, %47] [%32, %35] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_10 = memref.subview %subview_7[0, 0] [%32, %35] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_9, %subview_10 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%38 = vector.transfer_read %expand_shape[%6#0, %c0, %6#1, %8, %c0, %9], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%39 = vector.transpose %38, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_8 = memref.expand_shape %alloc_2 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%40 = vector.transfer_read %expand_shape_8[%c0, %9, %c0, %8], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%41 = vector.transpose %40, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%42 = vector.extract %39[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%43 = vector.extract %41[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%44 = vector.extract %arg3[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%45 = iree_gpu.multi_mma %42, %43, %44 {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = [], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x4xf16>, vector<1x4xf16> into vector<4x1xf32>
%46 = vector.broadcast %45 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %46 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview[0, 0, 0, %9, 0, %8] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%11 = vector.transpose %10, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %11, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_3 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%12 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
%13 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%4)
gpu.barrier
scf.for %arg2 = %thread_id_x to %13 step %c256 {
%14:3 = affine.delinearize_index %arg2 into (2, 17, %12) : index, index, index
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%14#2)
%16 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%14#2)[%4]
%subview_6 = memref.subview %collapse_shape[%14#0, 0, %14#1, %15] [1, 1, 1, %16] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview_5[%14#0, 0, %14#1, %15] [1, 1, 1, %16] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %16 step %c1 {
%subview_8 = memref.subview %subview_7[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %subview_6[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%17 = vector.transfer_read %subview_9[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
vector.transfer_write %17, %subview_8[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
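// Note: in the portion shown, CSE leaves this function unchanged relative to the preceding dump; the per-iteration index arithmetic inside the reduction loop is only pulled out later, by loop-invariant code motion further down.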
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%5:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%6:2 = affine.delinearize_index %5#0 into (2, 2) : index, index
%subview = memref.subview %alloc_3[%6#0, 0, %6#1, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%7 = gpu.lane_id
%8 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%7)
%9 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%7)
%10 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%14 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%7, %6#1, %6#0)
%15:3 = affine.delinearize_index %14 into (2, 32, 4) : index, index, index
%16 = affine.apply affine_map<(d0) -> (d0 * 4)>(%15#2)
%17 = affine.min affine_map<(d0) -> (2, d0)>(%15#0)
%18 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%17)
%19 = arith.cmpi eq, %18, %c0 : index
%20 = affine.min affine_map<(d0) -> (17, d0)>(%15#1)
%21 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %15#2)
%25 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%24)
%26 = arith.cmpi eq, %25, %c0 : index
%27 = arith.ori %26, %23 : i1
%28 = scf.if %27 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_9 = memref.subview %0[%17, 0, 0, 0] [%18, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%18, %21, %25) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %18 step %c1 {
scf.for %arg5 = %c0 to %21 step %c1 {
scf.for %arg6 = %c0 to %25 step %c1 {
%47 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%24, %arg6)
%48 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %20, %24, %arg6)
%49 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %20, %24, %arg6)
%subview_12 = memref.subview %subview_9[%arg4, %48, %49, %47] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_13 = memref.subview %alloca[%arg4, 0, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
%50 = vector.transfer_read %subview_12[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %50, %subview_13[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_10 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_11 = memref.subview %alloca_10[0, 0, 0, 0] [%18, 1, %21, %25] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_11 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_6 = memref.subview %alloc[%15#0, 0, %15#1, %16] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%29 = vector.transfer_read %28[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
vector.transfer_write %29, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%30:2 = affine.delinearize_index %14 into (16, 16) : index, index
%subview_7 = memref.subview %alloc_2[%30#0, %30#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%31 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%30#0]
%32 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%30#1, %arg1)
%35 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %33 : i1
scf.if %37 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%47 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%34]
%subview_9 = memref.subview %1[%31, %47] [%32, %35] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_10 = memref.subview %subview_7[0, 0] [%32, %35] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_9, %subview_10 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%38 = vector.transfer_read %expand_shape[%6#0, %c0, %6#1, %8, %c0, %9], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%39 = vector.transpose %38, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_8 = memref.expand_shape %alloc_2 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%40 = vector.transfer_read %expand_shape_8[%c0, %9, %c0, %8], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%41 = vector.transpose %40, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%42 = vector.extract %39[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%43 = vector.extract %41[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%44 = vector.extract %arg3[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%45 = iree_gpu.multi_mma %42, %43, %44 {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = [], kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<1x4xf16>, vector<1x4xf16> into vector<4x1xf32>
%46 = vector.broadcast %45 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %46 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview[0, 0, 0, %9, 0, %8] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%11 = vector.transpose %10, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %11, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_3 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%12 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
%13 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%4)
gpu.barrier
scf.for %arg2 = %thread_id_x to %13 step %c256 {
%14:3 = affine.delinearize_index %arg2 into (2, 17, %12) : index, index, index
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%14#2)
%16 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%14#2)[%4]
%subview_6 = memref.subview %collapse_shape[%14#0, 0, %14#1, %15] [1, 1, 1, %16] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview_5[%14#0, 0, %14#1, %15] [1, 1, 1, %16] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %16 step %c1 {
%subview_8 = memref.subview %subview_7[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %subview_6[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%17 = vector.transfer_read %subview_9[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
vector.transfer_write %17, %subview_8[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After LowerIREEGPUOpsPass (iree-gpu-lower-ops) //----- //
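// Note: this pass lowers the iree_gpu.multi_mma seen in the previous dump into vector.shape_cast ops feeding a single amdgpu.mfma (m = 16, n = 16, k = 16, f16 operands accumulating into f32) inside the scf.for body below; the rest of the function is untouched.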
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%5:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%6:2 = affine.delinearize_index %5#0 into (2, 2) : index, index
%subview = memref.subview %alloc_3[%6#0, 0, %6#1, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%7 = gpu.lane_id
%8 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%7)
%9 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%7)
%10 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%14 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%7, %6#1, %6#0)
%15:3 = affine.delinearize_index %14 into (2, 32, 4) : index, index, index
%16 = affine.apply affine_map<(d0) -> (d0 * 4)>(%15#2)
%17 = affine.min affine_map<(d0) -> (2, d0)>(%15#0)
%18 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%17)
%19 = arith.cmpi eq, %18, %c0 : index
%20 = affine.min affine_map<(d0) -> (17, d0)>(%15#1)
%21 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %15#2)
%25 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%24)
%26 = arith.cmpi eq, %25, %c0 : index
%27 = arith.ori %26, %23 : i1
%28 = scf.if %27 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_9 = memref.subview %0[%17, 0, 0, 0] [%18, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%18, %21, %25) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %18 step %c1 {
scf.for %arg5 = %c0 to %21 step %c1 {
scf.for %arg6 = %c0 to %25 step %c1 {
%51 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%24, %arg6)
%52 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %20, %24, %arg6)
%53 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %20, %24, %arg6)
%subview_12 = memref.subview %subview_9[%arg4, %52, %53, %51] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_13 = memref.subview %alloca[%arg4, 0, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
%54 = vector.transfer_read %subview_12[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %54, %subview_13[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_10 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_11 = memref.subview %alloca_10[0, 0, 0, 0] [%18, 1, %21, %25] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_11 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_6 = memref.subview %alloc[%15#0, 0, %15#1, %16] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%29 = vector.transfer_read %28[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
vector.transfer_write %29, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%30:2 = affine.delinearize_index %14 into (16, 16) : index, index
%subview_7 = memref.subview %alloc_2[%30#0, %30#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%31 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%30#0]
%32 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%30#1, %arg1)
%35 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %33 : i1
scf.if %37 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%51 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%34]
%subview_9 = memref.subview %1[%31, %51] [%32, %35] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_10 = memref.subview %subview_7[0, 0] [%32, %35] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_9, %subview_10 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%38 = vector.transfer_read %expand_shape[%6#0, %c0, %6#1, %8, %c0, %9], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%39 = vector.transpose %38, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_8 = memref.expand_shape %alloc_2 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%40 = vector.transfer_read %expand_shape_8[%c0, %9, %c0, %8], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%41 = vector.transpose %40, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%42 = vector.extract %39[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%43 = vector.extract %41[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%44 = vector.extract %arg3[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%45 = vector.shape_cast %42 : vector<1x4xf16> to vector<4xf16>
%46 = vector.shape_cast %43 : vector<1x4xf16> to vector<4xf16>
%47 = vector.shape_cast %44 : vector<4x1xf32> to vector<4xf32>
%48 = amdgpu.mfma %45 * %46 + %47 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%49 = vector.shape_cast %48 : vector<4xf32> to vector<4x1xf32>
%50 = vector.broadcast %49 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %50 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview[0, 0, 0, %9, 0, %8] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%11 = vector.transpose %10, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %11, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_3 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%12 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
%13 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%4)
gpu.barrier
scf.for %arg2 = %thread_id_x to %13 step %c256 {
%14:3 = affine.delinearize_index %arg2 into (2, 17, %12) : index, index, index
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%14#2)
%16 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%14#2)[%4]
%subview_6 = memref.subview %collapse_shape[%14#0, 0, %14#1, %15] [1, 1, 1, %16] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview_5[%14#0, 0, %14#1, %15] [1, 1, 1, %16] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %16 step %c1 {
%subview_8 = memref.subview %subview_7[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %subview_6[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%17 = vector.transfer_read %subview_9[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
vector.transfer_write %17, %subview_8[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After UnrollAnnotatedLoopsPass (iree-codegen-unroll-annotated-loops) //----- //
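// Note: none of the scf.for loops in this dispatch carry unroll annotations, so this pass makes no visible change to the IR below.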
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%5:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%6:2 = affine.delinearize_index %5#0 into (2, 2) : index, index
%subview = memref.subview %alloc_3[%6#0, 0, %6#1, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%7 = gpu.lane_id
%8 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%7)
%9 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%7)
%10 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%14 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%7, %6#1, %6#0)
%15:3 = affine.delinearize_index %14 into (2, 32, 4) : index, index, index
%16 = affine.apply affine_map<(d0) -> (d0 * 4)>(%15#2)
%17 = affine.min affine_map<(d0) -> (2, d0)>(%15#0)
%18 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%17)
%19 = arith.cmpi eq, %18, %c0 : index
%20 = affine.min affine_map<(d0) -> (17, d0)>(%15#1)
%21 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%20)
%22 = arith.cmpi eq, %21, %c0 : index
%23 = arith.ori %22, %19 : i1
%24 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %15#2)
%25 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%24)
%26 = arith.cmpi eq, %25, %c0 : index
%27 = arith.ori %26, %23 : i1
%28 = scf.if %27 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_9 = memref.subview %0[%17, 0, 0, 0] [%18, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%18, %21, %25) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %18 step %c1 {
scf.for %arg5 = %c0 to %21 step %c1 {
scf.for %arg6 = %c0 to %25 step %c1 {
%51 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%24, %arg6)
%52 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %20, %24, %arg6)
%53 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %20, %24, %arg6)
%subview_12 = memref.subview %subview_9[%arg4, %52, %53, %51] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_13 = memref.subview %alloca[%arg4, 0, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
%54 = vector.transfer_read %subview_12[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %54, %subview_13[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_10 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_11 = memref.subview %alloca_10[0, 0, 0, 0] [%18, 1, %21, %25] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_11 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%subview_6 = memref.subview %alloc[%15#0, 0, %15#1, %16] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%29 = vector.transfer_read %28[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
vector.transfer_write %29, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%30:2 = affine.delinearize_index %14 into (16, 16) : index, index
%subview_7 = memref.subview %alloc_2[%30#0, %30#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%31 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%30#0]
%32 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%31)
%33 = arith.cmpi eq, %32, %c0 : index
%34 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%30#1, %arg1)
%35 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %36, %33 : i1
scf.if %37 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%51 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%34]
%subview_9 = memref.subview %1[%31, %51] [%32, %35] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_7 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_10 = memref.subview %subview_7[0, 0] [%32, %35] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_9, %subview_10 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%38 = vector.transfer_read %expand_shape[%6#0, %c0, %6#1, %8, %c0, %9], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%39 = vector.transpose %38, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%expand_shape_8 = memref.expand_shape %alloc_2 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%40 = vector.transfer_read %expand_shape_8[%c0, %9, %c0, %8], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%41 = vector.transpose %40, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%42 = vector.extract %39[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%43 = vector.extract %41[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%44 = vector.extract %arg3[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%45 = vector.shape_cast %42 : vector<1x4xf16> to vector<4xf16>
%46 = vector.shape_cast %43 : vector<1x4xf16> to vector<4xf16>
%47 = vector.shape_cast %44 : vector<4x1xf32> to vector<4xf32>
%48 = amdgpu.mfma %45 * %46 + %47 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%49 = vector.shape_cast %48 : vector<4xf32> to vector<4x1xf32>
%50 = vector.broadcast %49 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %50 : vector<1x1x1x1x4x1xf32>
}
%subview_4 = memref.subview %subview[0, 0, 0, %9, 0, %8] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%11 = vector.transpose %10, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %11, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_3 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%12 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
%13 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%4)
gpu.barrier
scf.for %arg2 = %thread_id_x to %13 step %c256 {
%14:3 = affine.delinearize_index %arg2 into (2, 17, %12) : index, index, index
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%14#2)
%16 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%14#2)[%4]
%subview_6 = memref.subview %collapse_shape[%14#0, 0, %14#1, %15] [1, 1, 1, %16] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview_5[%14#0, 0, %14#1, %15] [1, 1, 1, %16] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %16 step %c1 {
%subview_8 = memref.subview %subview_7[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %subview_6[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%17 = vector.transfer_read %subview_9[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
vector.transfer_write %17, %subview_8[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After IREELoopInvariantCodeMotionPass (iree-loop-invariant-code-motion) //----- //
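// Note: loop-invariant code motion hoists the thread-/lane-id-derived index math (affine.apply, affine.delinearize_index, affine.min), the workgroup-memory subviews, and the expand_shape ops out of the 721-iteration reduction loop (721 = ceildiv(11529, 16) K-tiles); only the k-dependent bounds computations, the global-to-shared copies, and the MFMA remain inside scf.for.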
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x16xf16, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x16xf16, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%5:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%6:2 = affine.delinearize_index %5#0 into (2, 2) : index, index
%subview = memref.subview %alloc_3[%6#0, 0, %6#1, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%7 = gpu.lane_id
%8 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%7)
%9 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%7)
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%7, %6#1, %6#0)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%subview_4 = memref.subview %alloc[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%20:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_5 = memref.subview %alloc_2[%20#0, %20#1] [1, 1] [1, 1] : memref<16x16xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%21 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%20#1, %arg1)
%22 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %21)
%23 = arith.cmpi eq, %22, %c0 : index
%expand_shape = memref.expand_shape %alloc [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>
%expand_shape_6 = memref.expand_shape %alloc_2 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, #gpu.address_space<workgroup>>
%24 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%28 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %11#2)
%29 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%28)
%30 = arith.cmpi eq, %29, %c0 : index
%31 = arith.ori %30, %19 : i1
%32 = scf.if %31 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_9 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %29) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %14 step %c1 {
scf.for %arg5 = %c0 to %17 step %c1 {
scf.for %arg6 = %c0 to %29 step %c1 {
%51 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%28, %arg6)
%52 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %16, %28, %arg6)
%53 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %16, %28, %arg6)
%subview_12 = memref.subview %subview_9[%arg4, %52, %53, %51] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_13 = memref.subview %alloca[%arg4, 0, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
%54 = vector.transfer_read %subview_12[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %54, %subview_13[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_10 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_11 = memref.subview %alloca_10[0, 0, 0, 0] [%14, 1, %17, %29] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_11 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_10 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%33 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
vector.transfer_write %33, %subview_4[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<1x1x1x4xf16, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%34 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%20#0]
%35 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %23, %36 : i1
scf.if %37 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_5 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%51 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%21]
%subview_9 = memref.subview %1[%34, %51] [%35, %22] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_5 : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_10 = memref.subview %subview_5[0, 0] [%35, %22] [1, 1] : memref<1x1xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_9, %subview_10 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%38 = vector.transfer_read %expand_shape[%6#0, %c0, %6#1, %8, %c0, %9], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%39 = vector.transpose %38, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%40 = vector.transfer_read %expand_shape_6[%c0, %9, %c0, %8], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%41 = vector.transpose %40, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%42 = vector.extract %39[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%43 = vector.extract %41[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%44 = vector.extract %arg3[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%45 = vector.shape_cast %42 : vector<1x4xf16> to vector<4xf16>
%46 = vector.shape_cast %43 : vector<1x4xf16> to vector<4xf16>
%47 = vector.shape_cast %44 : vector<4x1xf32> to vector<4xf32>
%48 = amdgpu.mfma %45 * %46 + %47 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%49 = vector.shape_cast %48 : vector<4xf32> to vector<4x1xf32>
%50 = vector.broadcast %49 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %50 : vector<1x1x1x1x4x1xf32>
}
%subview_7 = memref.subview %subview[0, 0, 0, %9, 0, %8] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%25 = vector.transpose %24, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %25, %subview_7[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_3 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview_8 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%26 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
%27 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%4)
gpu.barrier
scf.for %arg2 = %thread_id_x to %27 step %c256 {
%28:3 = affine.delinearize_index %arg2 into (2, 17, %26) : index, index, index
%29 = affine.apply affine_map<(d0) -> (d0 * 4)>(%28#2)
%30 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%28#2)[%4]
%subview_9 = memref.subview %collapse_shape[%28#0, 0, %28#1, %29] [1, 1, 1, %30] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %subview_8[%28#0, 0, %28#1, %29] [1, 1, 1, %30] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %30 step %c1 {
%subview_11 = memref.subview %subview_10[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %subview_9[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%31 = vector.transfer_read %subview_12[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
vector.transfer_write %31, %subview_11[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After GPUReduceBankConflictsPass (iree-codegen-gpu-reduce-bank-conflicts) //----- //
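// In the dump below this pass pads the innermost dimension of the f16 workgroup allocations
// (2x1x32x16 -> 2x1x32x20 and 16x16 -> 16x20) so that consecutive rows no longer start at the
// same shared-memory bank offset; all accesses then go through 16-wide subviews of the padded
// buffers. A minimal sketch of the resulting pattern, assuming a single 16x16 f16 tile:
//   %padded = memref.alloc() : memref<16x20xf16, #gpu.address_space<workgroup>>
//   %tile   = memref.subview %padded[0, 0] [16, 16] [1, 1]
//             : memref<16x20xf16, #gpu.address_space<workgroup>>
//             to memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>>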
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%4 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x20xf16, #gpu.address_space<workgroup>>
%subview = memref.subview %alloc[0, 0, 0, 0] [2, 1, 32, 16] [1, 1, 1, 1] : memref<2x1x32x20xf16, #gpu.address_space<workgroup>> to memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x20xf16, #gpu.address_space<workgroup>>
%subview_3 = memref.subview %alloc_2[0, 0] [16, 16] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>>
%alloc_4 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%5:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%6:2 = affine.delinearize_index %5#0 into (2, 2) : index, index
%subview_5 = memref.subview %alloc_4[%6#0, 0, %6#1, 0, 0, 0] [1, 1, 1, 16, 1, 16] [1, 1, 1, 1, 1, 1] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%7 = gpu.lane_id
%8 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%7)
%9 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%7)
%10 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%7, %6#1, %6#0)
%11:3 = affine.delinearize_index %10 into (2, 32, 4) : index, index, index
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%11#2)
%13 = affine.min affine_map<(d0) -> (2, d0)>(%11#0)
%14 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%13)
%15 = arith.cmpi eq, %14, %c0 : index
%16 = affine.min affine_map<(d0) -> (17, d0)>(%11#1)
%17 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%16)
%18 = arith.cmpi eq, %17, %c0 : index
%19 = arith.ori %18, %15 : i1
%subview_6 = memref.subview %subview[%11#0, 0, %11#1, %12] [1, 1, 1, 4] [1, 1, 1, 1] : memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>> to memref<1x1x1x4xf16, strided<[640, 640, 20, 1], offset: ?>, #gpu.address_space<workgroup>>
%20:2 = affine.delinearize_index %10 into (16, 16) : index, index
%subview_7 = memref.subview %subview_3[%20#0, %20#1] [1, 1] [1, 1] : memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
%21 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%20#1, %arg1)
%22 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%4, %21)
%23 = arith.cmpi eq, %22, %c0 : index
%expand_shape = memref.expand_shape %subview [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, strided<[640, 640, 320, 20, 16, 1]>, #gpu.address_space<workgroup>>
%expand_shape_8 = memref.expand_shape %subview_3 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, strided<[320, 20, 16, 1]>, #gpu.address_space<workgroup>>
%24 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%28 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %11#2)
%29 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%28)
%30 = arith.cmpi eq, %29, %c0 : index
%31 = arith.ori %30, %19 : i1
%32 = scf.if %31 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%subview_11 = memref.subview %0[%13, 0, 0, 0] [%14, 35, 35, 1281] [1, 1, 1, 1] : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloca = memref.alloca(%14, %17, %29) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %14 step %c1 {
scf.for %arg5 = %c0 to %17 step %c1 {
scf.for %arg6 = %c0 to %29 step %c1 {
%51 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%28, %arg6)
%52 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %16, %28, %arg6)
%53 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %16, %28, %arg6)
%subview_14 = memref.subview %subview_11[%arg4, %52, %53, %51] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x35x35x1281xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_15 = memref.subview %alloca[%arg4, 0, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
%54 = vector.transfer_read %subview_14[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf16, strided<[1569225, 44835, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %54, %subview_15[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<1x1x1x1xf16, strided<[?, ?, ?, 1], offset: ?>, #gpu.address_space<private>>
}
}
}
%alloca_12 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_12 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_13 = memref.subview %alloca_12[0, 0, 0, 0] [%14, 1, %17, %29] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_13 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_12 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%33 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
vector.transfer_write %33, %subview_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<1x1x1x4xf16, strided<[640, 640, 20, 1], offset: ?>, #gpu.address_space<workgroup>>
%34 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%20#0]
%35 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%34)
%36 = arith.cmpi eq, %35, %c0 : index
%37 = arith.ori %23, %36 : i1
scf.if %37 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_7 : memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%51 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%21]
%subview_11 = memref.subview %1[%34, %51] [%35, %22] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_7 : memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_12 = memref.subview %subview_7[0, 0] [%35, %22] [1, 1] : memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_11, %subview_12 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%38 = vector.transfer_read %expand_shape[%6#0, %c0, %6#1, %8, %c0, %9], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, strided<[640, 640, 320, 20, 16, 1]>, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%39 = vector.transpose %38, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%40 = vector.transfer_read %expand_shape_8[%c0, %9, %c0, %8], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, strided<[320, 20, 16, 1]>, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%41 = vector.transpose %40, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%42 = vector.extract %39[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%43 = vector.extract %41[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%44 = vector.extract %arg3[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%45 = vector.shape_cast %42 : vector<1x4xf16> to vector<4xf16>
%46 = vector.shape_cast %43 : vector<1x4xf16> to vector<4xf16>
%47 = vector.shape_cast %44 : vector<4x1xf32> to vector<4xf32>
%48 = amdgpu.mfma %45 * %46 + %47 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%49 = vector.shape_cast %48 : vector<4xf32> to vector<4x1xf32>
%50 = vector.broadcast %49 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %50 : vector<1x1x1x1x4x1xf32>
}
%subview_9 = memref.subview %subview_5[0, 0, 0, %9, 0, %8] [1, 1, 1, 4, 1, 1] [1, 1, 1, 1, 1, 1] : memref<1x1x1x16x1x16xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%25 = vector.transpose %24, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
vector.transfer_write %25, %subview_9[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<1x1x1x4x1x1xf32, strided<[512, 512, 256, 16, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_4 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %2[0, %arg0, 0, %3] [2, 1, 17, %4] [1, 1, 1, 1] : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>> to memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%26 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%4)
%27 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%4)
gpu.barrier
scf.for %arg2 = %thread_id_x to %27 step %c256 {
%28:3 = affine.delinearize_index %arg2 into (2, 17, %26) : index, index, index
%29 = affine.apply affine_map<(d0) -> (d0 * 4)>(%28#2)
%30 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%28#2)[%4]
%subview_11 = memref.subview %collapse_shape[%28#0, 0, %28#1, %29] [1, 1, 1, %30] [1, 1, 1, 1] : memref<2x1x32x16xf32, #gpu.address_space<workgroup>> to memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_12 = memref.subview %subview_10[%28#0, 0, %28#1, %29] [1, 1, 1, %30] [1, 1, 1, 1] : memref<2x1x17x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %30 step %c1 {
%subview_13 = memref.subview %subview_12[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_14 = memref.subview %subview_11[0, 0, 0, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x1x?xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>
%31 = vector.transfer_read %subview_14[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x1x1xf32, strided<[512, 512, 16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
vector.transfer_write %31, %subview_13[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<1x1x1x1xf32, strided<[370209, 21777, 1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
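// In the dump below, the memref.subview chains that fed vector.transfer_read/transfer_write have
// been folded into the transfers themselves, which now index the base buffers (%0, %2, %alloc,
// %alloc_4, and the private %alloca) with composed affine indices; the expand_shape/collapse_shape
// views and the subviews consumed by linalg ops and memref.copy are left in place.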
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x20xf16, #gpu.address_space<workgroup>>
%subview = memref.subview %alloc[0, 0, 0, 0] [2, 1, 32, 16] [1, 1, 1, 1] : memref<2x1x32x20xf16, #gpu.address_space<workgroup>> to memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x20xf16, #gpu.address_space<workgroup>>
%subview_3 = memref.subview %alloc_2[0, 0] [16, 16] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>>
%alloc_4 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%4:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%5:2 = affine.delinearize_index %4#0 into (2, 2) : index, index
%6 = gpu.lane_id
%7 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%6)
%8 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%6)
%9 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%6, %5#1, %5#0)
%10:3 = affine.delinearize_index %9 into (2, 32, 4) : index, index, index
%11 = affine.min affine_map<(d0) -> (2, d0)>(%10#0)
%12 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%11)
%13 = arith.cmpi eq, %12, %c0 : index
%14 = affine.min affine_map<(d0) -> (17, d0)>(%10#1)
%15 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%14)
%16 = arith.cmpi eq, %15, %c0 : index
%17 = arith.ori %16, %13 : i1
%18:2 = affine.delinearize_index %9 into (16, 16) : index, index
%subview_5 = memref.subview %alloc_2[%18#0, %18#1] [1, 1] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
%19 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%18#1, %arg1)
%20 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%3, %19)
%21 = arith.cmpi eq, %20, %c0 : index
%expand_shape = memref.expand_shape %subview [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, strided<[640, 640, 320, 20, 16, 1]>, #gpu.address_space<workgroup>>
%expand_shape_6 = memref.expand_shape %subview_3 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, strided<[320, 20, 16, 1]>, #gpu.address_space<workgroup>>
%22 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%28 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %10#2)
%29 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%28)
%30 = arith.cmpi eq, %29, %c0 : index
%31 = arith.ori %30, %17 : i1
%32 = scf.if %31 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%alloca = memref.alloca(%12, %15, %29) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %12 step %c1 {
scf.for %arg5 = %c0 to %15 step %c1 {
scf.for %arg6 = %c0 to %29 step %c1 {
%52 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%11, %arg4]
%53 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %14, %28, %arg6)
%54 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %14, %28, %arg6)
%55 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%28, %arg6)
%56 = vector.transfer_read %0[%52, %53, %54, %55], %cst_0 {in_bounds = [true, true, true, true]} : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %56, %alloca[%arg4, %c0, %arg5, %arg6] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<?x1x?x?xf16, #gpu.address_space<private>>
}
}
}
%alloca_7 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_7 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_8 = memref.subview %alloca_7[0, 0, 0, 0] [%12, 1, %15, %29] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_8 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_7 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%33 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
%34 = affine.apply affine_map<(d0) -> (d0 * 4)>(%10#2)
vector.transfer_write %33, %alloc[%10#0, %c0, %10#1, %34] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<2x1x32x20xf16, #gpu.address_space<workgroup>>
%35 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%18#0]
%36 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %21, %37 : i1
scf.if %38 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_5 : memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%52 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%19]
%subview_7 = memref.subview %1[%35, %52] [%36, %20] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_5 : memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_8 = memref.subview %alloc_2[%18#0, %18#1] [%36, %20] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_7, %subview_8 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%39 = vector.transfer_read %expand_shape[%5#0, %c0, %5#1, %7, %c0, %8], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, strided<[640, 640, 320, 20, 16, 1]>, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%40 = vector.transpose %39, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%41 = vector.transfer_read %expand_shape_6[%c0, %8, %c0, %7], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, strided<[320, 20, 16, 1]>, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%42 = vector.transpose %41, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%43 = vector.extract %40[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%44 = vector.extract %42[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%45 = vector.extract %arg3[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%46 = vector.shape_cast %43 : vector<1x4xf16> to vector<4xf16>
%47 = vector.shape_cast %44 : vector<1x4xf16> to vector<4xf16>
%48 = vector.shape_cast %45 : vector<4x1xf32> to vector<4xf32>
%49 = amdgpu.mfma %46 * %47 + %48 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%50 = vector.shape_cast %49 : vector<4xf32> to vector<4x1xf32>
%51 = vector.broadcast %50 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %51 : vector<1x1x1x1x4x1xf32>
}
%23 = vector.transpose %22, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%6)
%25 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%6)
vector.transfer_write %23, %alloc_4[%5#0, %c0, %5#1, %24, %c0, %25] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_4 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%26 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%3)
%27 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%3)
gpu.barrier
scf.for %arg2 = %thread_id_x to %27 step %c256 {
%28:3 = affine.delinearize_index %arg2 into (2, 17, %26) : index, index, index
%29 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%28#2)[%3]
scf.for %arg3 = %c0 to %29 step %c1 {
%30 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%28#2)[%arg3]
%31 = vector.transfer_read %collapse_shape[%28#0, %c0, %28#1, %30], %cst {in_bounds = [true, true, true, true]} : memref<2x1x32x16xf32, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
%32 = affine.apply affine_map<(d0, d1)[s0] -> (d0 * 16 + d1 * 4 + s0)>(%arg1, %28#2)[%arg3]
vector.transfer_write %31, %2[%28#0, %arg0, %28#1, %32] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
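// Canonicalization finds nothing further to simplify at this point; the dump below appears
// identical to the previous one.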
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x20xf16, #gpu.address_space<workgroup>>
%subview = memref.subview %alloc[0, 0, 0, 0] [2, 1, 32, 16] [1, 1, 1, 1] : memref<2x1x32x20xf16, #gpu.address_space<workgroup>> to memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x20xf16, #gpu.address_space<workgroup>>
%subview_3 = memref.subview %alloc_2[0, 0] [16, 16] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>>
%alloc_4 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%4:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%5:2 = affine.delinearize_index %4#0 into (2, 2) : index, index
%6 = gpu.lane_id
%7 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%6)
%8 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%6)
%9 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%6, %5#1, %5#0)
%10:3 = affine.delinearize_index %9 into (2, 32, 4) : index, index, index
%11 = affine.min affine_map<(d0) -> (2, d0)>(%10#0)
%12 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%11)
%13 = arith.cmpi eq, %12, %c0 : index
%14 = affine.min affine_map<(d0) -> (17, d0)>(%10#1)
%15 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%14)
%16 = arith.cmpi eq, %15, %c0 : index
%17 = arith.ori %16, %13 : i1
%18:2 = affine.delinearize_index %9 into (16, 16) : index, index
%subview_5 = memref.subview %alloc_2[%18#0, %18#1] [1, 1] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
%19 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%18#1, %arg1)
%20 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%3, %19)
%21 = arith.cmpi eq, %20, %c0 : index
%expand_shape = memref.expand_shape %subview [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, strided<[640, 640, 320, 20, 16, 1]>, #gpu.address_space<workgroup>>
%expand_shape_6 = memref.expand_shape %subview_3 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, strided<[320, 20, 16, 1]>, #gpu.address_space<workgroup>>
%22 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%28 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %10#2)
%29 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%28)
%30 = arith.cmpi eq, %29, %c0 : index
%31 = arith.ori %30, %17 : i1
%32 = scf.if %31 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%alloca = memref.alloca(%12, %15, %29) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %12 step %c1 {
scf.for %arg5 = %c0 to %15 step %c1 {
scf.for %arg6 = %c0 to %29 step %c1 {
%52 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%11, %arg4]
%53 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %14, %28, %arg6)
%54 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %14, %28, %arg6)
%55 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%28, %arg6)
%56 = vector.transfer_read %0[%52, %53, %54, %55], %cst_0 {in_bounds = [true, true, true, true]} : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %56, %alloca[%arg4, %c0, %arg5, %arg6] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<?x1x?x?xf16, #gpu.address_space<private>>
}
}
}
%alloca_7 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_7 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_8 = memref.subview %alloca_7[0, 0, 0, 0] [%12, 1, %15, %29] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_8 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_7 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%33 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
%34 = affine.apply affine_map<(d0) -> (d0 * 4)>(%10#2)
vector.transfer_write %33, %alloc[%10#0, %c0, %10#1, %34] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<2x1x32x20xf16, #gpu.address_space<workgroup>>
%35 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%18#0]
%36 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%35)
%37 = arith.cmpi eq, %36, %c0 : index
%38 = arith.ori %21, %37 : i1
scf.if %38 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_5 : memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
} else {
%52 = affine.apply affine_map<(d0)[s0] -> (d0 * 16 + s0)>(%arg1)[%19]
%subview_7 = memref.subview %1[%35, %52] [%36, %20] [1, 1] : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill ins(%cst_0 : f16) outs(%subview_5 : memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>)
%subview_8 = memref.subview %alloc_2[%18#0, %18#1] [%36, %20] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<?x?xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_7, %subview_8 : memref<?x?xf16, strided<[1281, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x?xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
%39 = vector.transfer_read %expand_shape[%5#0, %c0, %5#1, %7, %c0, %8], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<2x1x2x16x1x16xf16, strided<[640, 640, 320, 20, 16, 1]>, #gpu.address_space<workgroup>>, vector<1x1x1x1x1x4xf16>
%40 = vector.transpose %39, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x1x4xf16> to vector<1x1x1x1x1x4xf16>
%41 = vector.transfer_read %expand_shape_6[%c0, %8, %c0, %7], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x16x1x16xf16, strided<[320, 20, 16, 1]>, #gpu.address_space<workgroup>>, vector<1x4x1x1xf16>
%42 = vector.transpose %41, [0, 2, 3, 1] : vector<1x4x1x1xf16> to vector<1x1x1x4xf16>
%43 = vector.extract %40[0, 0, 0, 0] : vector<1x4xf16> from vector<1x1x1x1x1x4xf16>
%44 = vector.extract %42[0, 0] : vector<1x4xf16> from vector<1x1x1x4xf16>
%45 = vector.extract %arg3[0, 0, 0, 0] : vector<4x1xf32> from vector<1x1x1x1x4x1xf32>
%46 = vector.shape_cast %43 : vector<1x4xf16> to vector<4xf16>
%47 = vector.shape_cast %44 : vector<1x4xf16> to vector<4xf16>
%48 = vector.shape_cast %45 : vector<4x1xf32> to vector<4xf32>
%49 = amdgpu.mfma %46 * %47 + %48 {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
%50 = vector.shape_cast %49 : vector<4xf32> to vector<4x1xf32>
%51 = vector.broadcast %50 : vector<4x1xf32> to vector<1x1x1x1x4x1xf32>
scf.yield %51 : vector<1x1x1x1x4x1xf32>
}
%23 = vector.transpose %22, [0, 1, 2, 4, 3, 5] : vector<1x1x1x1x4x1xf32> to vector<1x1x1x4x1x1xf32>
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%6)
%25 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%6)
vector.transfer_write %23, %alloc_4[%5#0, %c0, %5#1, %24, %c0, %25] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x4x1x1xf32>, memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
gpu.barrier
%collapse_shape = memref.collapse_shape %alloc_4 [[0], [1], [2, 3], [4, 5]] : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>> into memref<2x1x32x16xf32, #gpu.address_space<workgroup>>
%26 = affine.apply affine_map<(d0) -> (d0 ceildiv 4)>(%3)
%27 = affine.apply affine_map<(d0) -> ((d0 ceildiv 4) * 34)>(%3)
gpu.barrier
scf.for %arg2 = %thread_id_x to %27 step %c256 {
%28:3 = affine.delinearize_index %arg2 into (2, 17, %26) : index, index, index
%29 = affine.min affine_map<(d0)[s0] -> (d0 * -4 + s0, 4)>(%28#2)[%3]
scf.for %arg3 = %c0 to %29 step %c1 {
%30 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%28#2)[%arg3]
%31 = vector.transfer_read %collapse_shape[%28#0, %c0, %28#1, %30], %cst {in_bounds = [true, true, true, true]} : memref<2x1x32x16xf32, #gpu.address_space<workgroup>>, vector<1x1x1x1xf32>
%32 = affine.apply affine_map<(d0, d1)[s0] -> (d0 * 16 + d1 * 4 + s0)>(%arg1, %28#2)[%arg3]
vector.transfer_write %31, %2[%28#0, %arg0, %28#1, %32] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf32>, memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
}
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
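// CSE deduplicates identical pure ops. Relative to the previous dump, the SSA numbering inside
// the K loop shifts from %28 to %26, consistent with the two duplicate lane-index affine.apply
// ops after the loop (recomputing the "mod 16" and "floordiv 16" values already available as
// %7 and %8) being eliminated.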
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c1 = arith.constant 1 : index
%c721 = arith.constant 721 : index
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x4x1xf32>
%thread_id_x = gpu.thread_id x
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<11529x1281xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2x17x17x1281xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (17, 81) {
%3 = affine.min affine_map<(d0) -> (d0 * -16 + 1281, 16)>(%arg1)
%alloc = memref.alloc() : memref<2x1x32x20xf16, #gpu.address_space<workgroup>>
%subview = memref.subview %alloc[0, 0, 0, 0] [2, 1, 32, 16] [1, 1, 1, 1] : memref<2x1x32x20xf16, #gpu.address_space<workgroup>> to memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<16x20xf16, #gpu.address_space<workgroup>>
%subview_3 = memref.subview %alloc_2[0, 0] [16, 16] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>>
%alloc_4 = memref.alloc() : memref<2x1x2x16x1x16xf32, #gpu.address_space<workgroup>>
%4:2 = affine.delinearize_index %thread_id_x into (4, 64) : index, index
gpu.barrier
%5:2 = affine.delinearize_index %4#0 into (2, 2) : index, index
%6 = gpu.lane_id
%7 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%6)
%8 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>(%6)
%9 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 * 64 + d2 * 128)>(%6, %5#1, %5#0)
%10:3 = affine.delinearize_index %9 into (2, 32, 4) : index, index, index
%11 = affine.min affine_map<(d0) -> (2, d0)>(%10#0)
%12 = affine.min affine_map<(d0) -> (-d0 + 2, 1)>(%11)
%13 = arith.cmpi eq, %12, %c0 : index
%14 = affine.min affine_map<(d0) -> (17, d0)>(%10#1)
%15 = affine.min affine_map<(d0) -> (-d0 + 17, 1)>(%14)
%16 = arith.cmpi eq, %15, %c0 : index
%17 = arith.ori %16, %13 : i1
%18:2 = affine.delinearize_index %9 into (16, 16) : index, index
%subview_5 = memref.subview %alloc_2[%18#0, %18#1] [1, 1] [1, 1] : memref<16x20xf16, #gpu.address_space<workgroup>> to memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
%19 = affine.min affine_map<(d0, d1) -> (d1 * -16 + 1281, 16, d0)>(%18#1, %arg1)
%20 = affine.min affine_map<(d0, d1) -> (1, d0 - d1)>(%3, %19)
%21 = arith.cmpi eq, %20, %c0 : index
%expand_shape = memref.expand_shape %subview [[0], [1], [2, 3], [4, 5]] output_shape [2, 1, 2, 16, 1, 16] : memref<2x1x32x16xf16, strided<[640, 640, 20, 1]>, #gpu.address_space<workgroup>> into memref<2x1x2x16x1x16xf16, strided<[640, 640, 320, 20, 16, 1]>, #gpu.address_space<workgroup>>
%expand_shape_6 = memref.expand_shape %subview_3 [[0, 1], [2, 3]] output_shape [1, 16, 1, 16] : memref<16x16xf16, strided<[20, 1]>, #gpu.address_space<workgroup>> into memref<1x16x1x16xf16, strided<[320, 20, 16, 1]>, #gpu.address_space<workgroup>>
%22 = scf.for %arg2 = %c0 to %c721 step %c1 iter_args(%arg3 = %cst_1) -> (vector<1x1x1x1x4x1xf32>) {
gpu.barrier
%26 = affine.min affine_map<(d0, d1) -> (11529, d0 * 16 + d1 * 4)>(%arg2, %10#2)
%27 = affine.min affine_map<(d0) -> (-d0 + 11529, 4)>(%26)
%28 = arith.cmpi eq, %27, %c0 : index
%29 = arith.ori %28, %17 : i1
%30 = scf.if %29 -> (memref<1x1x1x4xf16, #gpu.address_space<private>>) {
%alloca = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>) {
^bb0(%out: f16):
linalg.yield %cst_0 : f16
}
scf.yield %alloca : memref<1x1x1x4xf16, #gpu.address_space<private>>
} else {
%alloca = memref.alloca(%12, %15, %27) : memref<?x1x?x?xf16, #gpu.address_space<private>>
scf.for %arg4 = %c0 to %12 step %c1 {
scf.for %arg5 = %c0 to %15 step %c1 {
scf.for %arg6 = %c0 to %27 step %c1 {
%50 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%11, %arg4]
%51 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (((d0 + d1 * 17 + d2) floordiv 17) * 2 + (d3 + d4) floordiv 3843)>(%arg5, %arg0, %14, %26, %arg6)
%52 = affine.apply affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d1 * 34 + d2 * 2 - ((d0 + d1 * 17 + d2) floordiv 17) * 34 + ((d3 + d4) mod 3843) floordiv 1281)>(%arg5, %arg0, %14, %26, %arg6)
%53 = affine.apply affine_map<(d0, d1) -> ((d0 + d1) mod 1281)>(%26, %arg6)
%54 = vector.transfer_read %0[%50, %51, %52, %53], %cst_0 {in_bounds = [true, true, true, true]} : memref<2x35x35x1281xf16, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1xf16>
vector.transfer_write %54, %alloca[%arg4, %c0, %arg5, %arg6] {in_bounds = [true, true, true, true]} : vector<1x1x1x1xf16>, memref<?x1x?x?xf16, #gpu.address_space<private>>
}
}
}
%alloca_7 = memref.alloca() : memref<1x1x1x4xf16, #gpu.address_space<private>>
linalg.fill ins(%cst_0 : f16) outs(%alloca_7 : memref<1x1x1x4xf16, #gpu.address_space<private>>)
%subview_8 = memref.subview %alloca_7[0, 0, 0, 0] [%12, 1, %15, %27] [1, 1, 1, 1] : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
memref.copy %alloca, %subview_8 : memref<?x1x?x?xf16, #gpu.address_space<private>> to memref<?x1x?x?xf16, strided<[4, 4, 4, 1]>, #gpu.address_space<private>>
scf.yield %alloca_7 : memref<1x1x1x4xf16, #gpu.address_space<private>>
}
%31 = vector.transfer_read %30[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x1x1x4xf16, #gpu.address_space<private>>, vector<1x1x1x4xf16>
%32 = affine.apply affine_map<(d0) -> (d0 * 4)>(%10#2)
vector.transfer_write %31, %alloc[%10#0, %c0, %10#1, %32] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf16>, memref<2x1x32x20xf16, #gpu.address_space<workgroup>>
%33 = affine.min affine_map<(d0)[s0] -> (11529, d0 * 16 + s0)>(%arg2)[%18#0]
%34 = affine.min affine_map<(d0) -> (-d0 + 11529, 1)>(%33)
%35 = arith.cmpi eq, %34, %c0 : index
%36 = arith.ori %21, %35 : i1
scf.if %36 {
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%subview_5 : memref<1x1xf16, strided<[20, 1], offset: ?>, #gpu.addre
// (The remainder of this IR dump is truncated: the full file is too large to display on GitHub.)